import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import re
import numpy
from random import randrange
from pandas import Series
from matplotlib import pyplot
import calmap
from statsmodels.tsa.seasonal import seasonal_decompose
#from neuralprophet import NeuralProphet
import warnings
#import pmdarima as pm
from datetime import datetime
from datetime import timedelta
from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from time import time
import seaborn as sns
import statsmodels.api as sm
import numpy as np
from sklearn.cluster import SpectralClustering
#from spectralcluster import SpectralClusterer
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from scipy import stats
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
Using glob we scan a local folder and collect every file it contains. Using a regular expression we filter out any file that isn't a .csv. We then name each file appropriately, read it, and assign it to the corresponding variable.
## Find all .csv files in the specified directory
# Directory holding the project CSV files. FIX: the original used
# r"project_datasets\\" (a raw string), which contains TWO literal
# backslashes, and named the variable `dir`, shadowing the builtin.
data_dir = "project_datasets\\"
# glob collects every CSV file path in the directory into a list.
all_files = glob.glob(data_dir + "*.csv")
## Using RegEx take the name and format it (Format => "df_A")
# Captures the text between a path backslash and the first following dot,
# e.g. "data_a" out of r"project_datasets\data_a.csv".
pattern = re.compile(r'(?<=\\)(.*?)(?=\.)')
def get_file_names(x):
    """Derive dataframe variable names from CSV file paths.

    A path such as ``project_datasets\\data_a.csv`` yields ``df_a``: the
    text between the last backslash and the first following dot is
    extracted, ``data`` is replaced with ``df``, and the result is
    lower-cased (our existing documents use lower case, e.g. df_a, df_d).

    Args:
        x: iterable of Windows-style file path strings.

    Returns:
        list[str]: derived dataframe names, in input order.
    """
    # Text between a path backslash and the first dot, e.g. "data_a".
    name_pattern = re.compile(r'(?<=\\)(.*?)(?=\.)')
    return [
        found.replace("data", "df").lower()
        for file_path in x
        for found in re.findall(name_pattern, file_path)
    ]
## Read every discovered file and bind each dataframe to its derived name
dataframe_names = get_file_names(all_files)
# Pair each derived name with its source path and load the CSV into a
# module-level variable of that name (e.g. df_a, df_b, ...).
for frame_name, csv_path in zip(dataframe_names, all_files):
    globals()[frame_name] = pd.read_csv(csv_path)
print("Available Dataframes:")
print(dataframe_names)
Available Dataframes: ['df_a', 'df_aa', 'df_ab', 'df_ac', 'df_ad', 'df_b', 'df_c', 'df_d', 'df_e', 'df_f', 'df_g', 'df_h', 'df_i', 'df_j', 'df_k', 'df_m', 'df_n', 'df_o', 'df_p', 'df_q', 'df_r', 'df_s', 'df_t', 'df_u', 'df_v', 'df_w', 'df_x', 'df_y', 'df_z']
We convert the EventDt column in each available dataframe to DateTime format, after which we set it as the index and resample the data at a 5-minute frequency. Additionally, we add a daily, weekly and monthly variant of each dataframe to assist with the data analysis later in this notebook.
## Convert EventDt to datetime and turn every dataframe into a
## 5-minute-resolution time series.
for frame_name in dataframe_names:
    frame = globals()[frame_name]
    # Parse the timestamps, promote them to the index, then average the
    # readings into 5-minute buckets.
    frame["EventDt"] = pd.to_datetime(frame["EventDt"])
    frame = frame.set_index(frame["EventDt"])
    globals()[frame_name] = frame.resample(rule="5T").mean()
df_a.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 315647 entries, 2018-11-18 16:10:00 to 2021-11-18 16:00:00 Freq: 5T Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 258390 non-null float64 dtypes: float64(1) memory usage: 4.8 MB
## Derive daily, weekly and monthly resampled variants of each dataframe
for frame_name in dataframe_names:
    base = globals()[frame_name]
    # e.g. df_a -> df_a_daily, df_a_weekly, df_a_monthly
    for suffix, rule in (("daily", "D"), ("weekly", "W"), ("monthly", "M")):
        globals()["%s_%s" % (frame_name, suffix)] = base.resample(rule=rule).mean()
Before we went any further with our research, we wanted to see what each device was recording and whether it was being read correctly.
# Plot every device on one shared axis for a first overall impression.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# TODO: fix the date axis formatting, or zoom into a shorter window
frames = [df_a, df_aa, df_ab, df_ac, df_ad, df_b, df_c, df_d, df_e, df_f,
          df_g, df_h, df_i, df_j, df_k, df_m, df_n, df_o, df_p, df_q,
          df_r, df_s, df_t, df_u, df_v, df_w, df_x, df_y, df_z]
fig, ax = plt.subplots()
for frame in frames:
    sns.lineplot(x='EventDt', y='Temp', data=frame, ax=ax)
plt.title("Linear graph")
Text(0.5, 1.0, 'Linear graph')
As can be observed, the representation of each sensor is a jumble, making it difficult to understand what is going on within. To approach the analysis through this research, we will first compile average temperatures for each device to evaluate where they stand from freezing to warming.
To cut-down on load times, instead of plotting each dataframe values directly, we create a dictionary with the names and average temperature for each of our devices. Said dictionary is later plotted using SeaBorn barplot.
# Collect one representative temperature per device so that a single bar
# can stand in for the whole series.
# NOTE(review): despite the column name, this is the 35th percentile of
# Temp, not the arithmetic mean — confirm this is intentional.
average_temps = [globals()[frame_name]["Temp"].quantile(.35)
                 for frame_name in dataframe_names]
mean_temp_dev_dictionary = {"Device": dataframe_names,
                            "Mean Temperature": average_temps}
mean_temp_dev = pd.DataFrame.from_dict(mean_temp_dev_dictionary)
mean_temp_dev.T
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Device | df_a | df_aa | df_ab | df_ac | df_ad | df_b | df_c | df_d | df_e | df_f | ... | df_q | df_r | df_s | df_t | df_u | df_v | df_w | df_x | df_y | df_z |
| Mean Temperature | 17.33 | -21.0 | 18.83 | 21.87 | 19.41 | 5.55 | 14.15 | 16.84 | -24.13 | 21.75 | ... | 18.89 | 4.53 | 0.96 | 13.0 | 19.53 | 4.14 | 6.36 | 5.19 | -21.1 | 5.54 |
2 rows × 29 columns
After computing the average temperatures, we opted to visualise the devices from the coldest to the warmest to see where each one stands and to categorize it.
# Bar chart of the representative temperature per device, coldest first.
sns.set(rc={'figure.figsize': (18, 8)})
device_order = mean_temp_dev.sort_values('Mean Temperature').Device
sns.barplot(x="Device",
            y="Mean Temperature",
            data=mean_temp_dev,
            order=device_order,
            palette="coolwarm").set(title='Categories')
plt.show()
After plotting the dictionary we can easily see the average temperature for each dataframe and compare them to one another. This will help us with categorizing and labeling our dataframes.
After careful consideration and discussion we decided to separate our dataframes into three distinct categories: Freezer, Fridge, Pantry.
# Sanity-check the ordering used by the bar chart (coldest to warmest).
mean_temp_dev.sort_values('Mean Temperature').T
| 11 | 14 | 8 | 27 | 1 | 21 | 10 | 24 | 15 | 20 | ... | 16 | 0 | 13 | 2 | 19 | 4 | 17 | 23 | 9 | 3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Device | df_h | df_k | df_e | df_y | df_aa | df_s | df_g | df_v | df_m | df_r | ... | df_n | df_a | df_j | df_ab | df_q | df_ad | df_o | df_u | df_f | df_ac |
| Mean Temperature | -27.33 | -26.97 | -24.13 | -21.1 | -21.0 | 0.96 | 1.91 | 4.14 | 4.37 | 4.53 | ... | 17.24 | 17.33 | 18.83 | 18.83 | 18.89 | 19.41 | 19.43 | 19.53 | 21.75 | 21.87 |
2 rows × 29 columns
#ax=mean_temp_dev.plot.bar(x='Device',
# y='Mean Temperature',
#figsize=(11,7),
# )
#label for first visual
#ax.bar_label(ax.containers[0], label_type='edge', fontweight='bold',padding=5)
#ax.margins(y=0.1)
Before moving further with the research, there are a few categories per device that can be followed, including the device's category and customer type; the next step will focus on that.
# Assign each device a category from its representative temperature, plus
# a manually researched customer type (aligned by position with
# dataframe_names; empty string = unknown customer).
categories = []
customer_types = [
    "",
    "Food counter",
    "",
    "Medical facility",
    "",
    "Pharmacy",
    "",
    "Food production",
    "Food production",
    "Food counter",
    "Food transport",
    "Food production",
    "",
    "",
    "Lab",
    "Medical facility",
    "",
    "",
    "Medical facility",
    "Food production",
    "Food counter",
    "Food production",
    "",
    "",
    "Medical facility",
    "Pharmacy",
    "Pharmacy",
    "Lab",
    "Pharmacy",
]
for value in mean_temp_dev['Mean Temperature']:
    # Temperature bands: <0 freezer, [0, 8] fridge, [14, 22) mapping,
    # anything else (8-14 or >=22) unlabelled.
    if value < 0:
        label = 'Freezer'
    elif value <= 8:
        label = 'Refrigerator'
    elif 14 <= value < 22:
        label = 'Mapping'
    else:
        label = 'Unknown'
    categories.append(label)
mean_temp_dev['Categories'] = categories
mean_temp_dev["Customer type"] = customer_types
mean_temp_dev.T
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Device | df_a | df_aa | df_ab | df_ac | df_ad | df_b | df_c | df_d | df_e | df_f | ... | df_q | df_r | df_s | df_t | df_u | df_v | df_w | df_x | df_y | df_z |
| Mean Temperature | 17.33 | -21.0 | 18.83 | 21.87 | 19.41 | 5.55 | 14.15 | 16.84 | -24.13 | 21.75 | ... | 18.89 | 4.53 | 0.96 | 13.0 | 19.53 | 4.14 | 6.36 | 5.19 | -21.1 | 5.54 |
| Categories | Mapping | Freezer | Mapping | Mapping | Mapping | Refrigerator | Mapping | Mapping | Freezer | Mapping | ... | Mapping | Refrigerator | Refrigerator | Unknown | Mapping | Refrigerator | Refrigerator | Refrigerator | Freezer | Refrigerator |
| Customer type | Food counter | Medical facility | Pharmacy | Food production | Food production | Food counter | ... | Food production | Food counter | Food production | Medical facility | Pharmacy | Pharmacy | Lab | Pharmacy |
4 rows × 29 columns
# Silence pandas warnings emitted by other notebook cells.
warnings.filterwarnings("ignore")
mean_temp_dev['Categories'] = mean_temp_dev['Categories'].apply(lambda x: x.strip())
# Refine the generic "Mapping" label using the customer type; Freezer and
# Refrigerator rows need no refinement (the original had empty pass
# branches for them, removed here).
# BUG FIX: the original wrote via chained assignment
# (mean_temp_dev['Categories'][x] = ...), which raises
# SettingWithCopyWarning and may silently write to a copy; .loc writes
# in place on the frame itself.
for x in mean_temp_dev.index:
    if mean_temp_dev.loc[x, 'Categories'] == "Mapping":
        if mean_temp_dev.loc[x, 'Customer type'] == "Medical facility":
            mean_temp_dev.loc[x, 'Categories'] = "Incubator"
        elif mean_temp_dev.loc[x, 'Customer type'] == "Food production":
            # Warmer mapping devices at food producers are production
            # floors; cooler ones are storage rooms.
            if mean_temp_dev.loc[x, 'Mean Temperature'] > 17:
                mean_temp_dev.loc[x, 'Categories'] = "Production area"
            else:
                mean_temp_dev.loc[x, 'Categories'] = "Storage"
mean_temp_dev
| Device | Mean Temperature | Categories | Customer type | |
|---|---|---|---|---|
| 0 | df_a | 17.33 | Mapping | |
| 1 | df_aa | -21.00 | Freezer | Food counter |
| 2 | df_ab | 18.83 | Mapping | |
| 3 | df_ac | 21.87 | Incubator | Medical facility |
| 4 | df_ad | 19.41 | Mapping | |
| 5 | df_b | 5.55 | Refrigerator | Pharmacy |
| 6 | df_c | 14.15 | Mapping | |
| 7 | df_d | 16.84 | Storage | Food production |
| 8 | df_e | -24.13 | Freezer | Food production |
| 9 | df_f | 21.75 | Mapping | Food counter |
| 10 | df_g | 1.91 | Refrigerator | Food transport |
| 11 | df_h | -27.33 | Freezer | Food production |
| 12 | df_i | 15.40 | Mapping | |
| 13 | df_j | 18.83 | Mapping | |
| 14 | df_k | -26.97 | Freezer | Lab |
| 15 | df_m | 4.37 | Refrigerator | Medical facility |
| 16 | df_n | 17.24 | Mapping | |
| 17 | df_o | 19.43 | Mapping | |
| 18 | df_p | 5.24 | Refrigerator | Medical facility |
| 19 | df_q | 18.89 | Production area | Food production |
| 20 | df_r | 4.53 | Refrigerator | Food counter |
| 21 | df_s | 0.96 | Refrigerator | Food production |
| 22 | df_t | 13.00 | Unknown | |
| 23 | df_u | 19.53 | Mapping | |
| 24 | df_v | 4.14 | Refrigerator | Medical facility |
| 25 | df_w | 6.36 | Refrigerator | Pharmacy |
| 26 | df_x | 5.19 | Refrigerator | Pharmacy |
| 27 | df_y | -21.10 | Freezer | Lab |
| 28 | df_z | 5.54 | Refrigerator | Pharmacy |
# Mean temperature per final category, ordered coldest to warmest.
category_order = ["Freezer", "Refrigerator", "Mapping", 'Unknown',
                  'Incubator', "Production area", "Storage"]
plot = sns.barplot(data=mean_temp_dev,
                   x="Categories",
                   y="Mean Temperature",
                   palette="Blues",
                   ci=None,
                   edgecolor="black",
                   order=category_order)
plt.bar_label(plot.containers[0], label_type='edge', padding=-25, fontweight='bold')
plt.title('Categories by Mean Temperature')
plt.xticks(rotation=45)
plt.show()
As can be seen, each gadget was assigned to one of three groups:
- Freezer: devices with temperature lower than 0.
- Refrigerator: devices with temperature higher than or equal to 0 but lower than or equal to 8.
- Mapping: devices with temperature higher than or equal to 14 but lower than 22.
- Unknown: devices with temperature outside these ranges, whose use we do not know.
- Incubator: devices in the mapping temperature range used in a medical facility.
- Production area: devices with temperature higher than 17 but lower than 22, used in food production.
- Storage: devices with temperature higher than or equal to 14 but lower than 17, used in food production.

Because the data does not contain any labels, this categorization is a deliberate choice made by the group members with the help of external resources.
Following the categorization of the devices, we want to observe each device's pattern through a line plot, which will allow us to approach any potential anomalies and subsequently approach the selection of view devices to make the research more comfortable.
# Bar chart: how many devices each customer type has.
fig = plt.figure(figsize=(11, 7))
# Frequency of each customer type across devices.
distribution = mean_temp_dev['Customer type'].value_counts()
# FIX: seaborn >= 0.12 no longer accepts positional data vectors;
# pass them through the x=/y= keywords.
plot = sns.barplot(x=distribution.values,
                   y=distribution.index,
                   palette="vlag",
                   edgecolor="black")
plt.bar_label(plot.containers[0], label_type='edge', padding=-25)
plt.title('Frequency Distribution of Customer Types')
plt.ylabel('Type', fontweight='bold', labelpad=20)
# FIX: corrected the "Occurances" typo in the axis label.
plt.xlabel('Number of Occurrences', fontweight='bold', labelpad=20)
plt.show()
Here we plot every dataframe divided by category, from which we will pick the most appropriate dataset to train our model on. In order for a dataframe to be deemed appropriate for further analysis it needs to fulfill the following conditions:
Elimination will be aided by a series of generic graphics, such as line plots to assess consistency, followed by box plots to identify missing values (outliers), and finally a final selection of a device per group.
## Freezer devices: AA, E, H, K, Y
fig, axes = plt.subplots(3, 2, figsize=(18, 18), constrained_layout=True)
fig.subplots_adjust(hspace=.3)
fig.suptitle('Line plot: Freezer Category', fontsize=20)
datalist = ["df_aa_daily", "df_e_daily", "df_h_daily", "df_k_daily", "df_y_daily"]
titles = ['Device: AA', 'Device: E', 'Device: H', 'Device: K', 'Device: Y']
# Fill the grid left-to-right, top-to-bottom (two columns).
for idx, (frame_name, title) in enumerate(zip(datalist, titles)):
    row, col = divmod(idx, 2)
    axes[row, col].plot(globals()[frame_name]["Temp"], color='#C0D6EA')
    axes[row, col].set_title(title, fontweight='bold', fontsize=13)
# Only five devices: drop the unused sixth axis.
fig.delaxes(axes[2, 1])
plt.draw()
As can be seen, there are 5 devices in the freezer category, all of which show a small number of anomalies and appear to be consistent in their patterns; however, with so many devices it would be difficult to analyse all of them. Therefore, to limit our research, we will leave out devices E and Y, because their unpredictable patterns could complicate the modeling.
## Refrigerator devices: B, G, M, P, R, S, V, W, X, Z
fig, axes = plt.subplots(5, 2, figsize=(18, 30), constrained_layout=True)
fig.subplots_adjust(hspace=.3)
fig.suptitle('Line plot: Refrigerator Category', fontsize=20)
datalist = ["df_b_daily", "df_g_daily", "df_m_daily", "df_p_daily", "df_r_daily",
            "df_s_daily", "df_v_daily", "df_w_daily", "df_x_daily", "df_z_daily"]
titles = ['Device: B', 'Device: G', 'Device: M', 'Device: P', 'Device: R',
          'Device: S', 'Device: V', 'Device: W', 'Device: X', 'Device: Z']
# Fill the grid left-to-right, top-to-bottom (two columns).
for idx, (frame_name, title) in enumerate(zip(datalist, titles)):
    row, col = divmod(idx, 2)
    axes[row, col].plot(globals()[frame_name]["Temp"], color='#abcae4')
    axes[row, col].set_title(title, fontweight='bold', fontsize=13)
plt.show()
In comparison to the freezer category, the refrigerator category contains more devices and thus more candidates for analysis, but that volume could also slow down the research and modeling. For future steps, we will stick with devices B, S, and G because of their lack of anomalies and their pattern consistency.
## Mapping devices: A, AB, AD, C, F, I, J, N, O, U
fig, axes = plt.subplots(5, 2, figsize=(18, 30), constrained_layout=True)
fig.subplots_adjust(hspace=.3)
fig.suptitle('Line plot: Mapping Category', fontsize=20)
datalist = ["df_a_daily", "df_ab_daily", "df_ad_daily", "df_c_daily", "df_f_daily",
            "df_i_daily", "df_j_daily", "df_n_daily", "df_o_daily", "df_u_daily"]
titles = ['Device: A', 'Device: AB', 'Device: AD', 'Device: C', 'Device: F',
          'Device: I', 'Device: J', 'Device: N', 'Device: O', 'Device: U']
# Fill the grid left-to-right, top-to-bottom (two columns).
for idx, (frame_name, title) in enumerate(zip(datalist, titles)):
    row, col = divmod(idx, 2)
    axes[row, col].plot(globals()[frame_name]["Temp"], color='#86b2d8')
    axes[row, col].set_title(title, fontweight='bold', fontsize=13)
plt.show()
When it came to the pantry, it can be seen that the majority of the devices ended up in this group, which could be due to the scale range that we provided, but within a follow-up for the freezer and fridge, this part allowed us to see the pattern and provide additional limitations so that we could continue with our analysis phase.
# Line plots for the three special-category devices.
fig, axes = plt.subplots(3, 1, figsize=(18, 13))
fig.subplots_adjust(hspace=.5)
panels = [(df_ac_daily, '#3d84bf', 'Device: AC - Incubator'),
          (df_q_daily, '#316a9a', 'Device: Q - Production Area'),
          (df_d_daily, '#255075', 'Device: D - Storage')]
for axis, (frame, shade, label) in zip(axes, panels):
    axis.plot(frame["Temp"], color=shade)
    axis.set_title(label, fontsize=15)
plt.show()
After manually selecting three devices from each category, we're ready to dig further into them, moving on to a second categorical elimination in the form of box plots, where we can see their outliers and eliminate the devices with the most of them.
# Box plots of the freezer candidates, to compare outlier counts.
fig, axes = plt.subplots(1, 3, figsize=(16, 7))
fig.suptitle('Freezers', fontsize=20)
candidates = [(df_k_daily, 'Device: K'),
              (df_aa_daily, 'Device: AA'),
              (df_h_daily, 'Device: H')]
for axis, (frame, label) in zip(axes, candidates):
    sns.boxplot(ax=axis, y="Temp", data=frame, color='#C0D6EA')
    axis.set_title(label)
plt.show()
Based on the results, we will choose device K since it has less outliers than devices AA and H, which can be easily spotted within a large number of them, slowing down our research and harming further modeling. As a consequence, we will continue to use device K for future study.
# Box plots of the refrigerator candidates, to compare outlier counts.
fig, axes = plt.subplots(1, 3, figsize=(16, 7))
fig.suptitle('Fridges', fontsize=20)
candidates = [(df_b_daily, 'Device: B'),
              (df_s_daily, 'Device: S'),
              (df_g_daily, 'Device: G')]
for axis, (frame, label) in zip(axes, candidates):
    sns.boxplot(ax=axis, y="Temp", data=frame, color='#abcae4')
    axis.set_title(label)
plt.show()
In the case of the refrigerator, it is clear that device B will be eliminated due to its outliers, and that device G will be eliminated due to the outliers accumulating in the scale from 5 to 4 degrees. As a result, we will stick with device S, even though it contributed to a few of the outliers, because it is more useful for further research and modeling than the other two.
# Box plots of the mapping/pantry candidates, to compare outlier counts.
fig, axes = plt.subplots(1, 3, figsize=(16, 7))
fig.suptitle('Pantries', fontsize=20)
candidates = [(df_i_daily, 'Device: I'),
              (df_o_daily, 'Device: O'),
              (df_ab_daily, 'Device: AB')]
for axis, (frame, label) in zip(axes, candidates):
    sns.boxplot(ax=axis, y="Temp", data=frame, color='#86b2d8')
    axis.set_title(label)
plt.show()
Following the same pattern as before, we will stick to device I as our main research recognition for the pantry category, based on the outliers in device D and AB. This is done with the idea that trained models with outliers will increase error variance and reduce the power of statistical tests for our modeling, so we approached to be sticking with device I.
# Box plots of the three special-category devices.
fig, axes = plt.subplots(1, 3, figsize=(16, 7))
fig.suptitle('Extras', fontsize=20)
candidates = [(df_ac_daily, '#3d84bf', 'Device: AC - Incubator'),
              (df_q_daily, '#316a9a', 'Device: Q - Production Area'),
              (df_d_daily, '#255075', 'Device: D - Storage')]
for axis, (frame, shade, label) in zip(axes, candidates):
    sns.boxplot(ax=axis, y="Temp", data=frame, color=shade)
    axis.set_title(label)
plt.show()
Following the same pattern as before, even if device AC does not appear to have any outliers, it is still the device that requires additional supporting data to proceed with further research, thus the second apparent choice is device D, which has the second fewest outliers.
Based on the analysis that we've done, we decide to use Device K for the freezer, Device S for the Refrigerator, Device I for the mapping, and lastly Device D as a storage. All of these devices will be accompanied for the following step, which will include a deeper dig into them in terms of missing data, a general session overview, restrictions, and so on, as well as preparing them for modeling.
It can be seen that device K has a total of 59 missing values, which we will examine further with visuals. The following step will aid in the detection of outliers.
From the plot below, we can see that on the first plot is showing the outliers from device K in each month, the second plot is showing the overview for the entire week, and lastly the third plot is showing us the overview for the whole day (24 Hours).
# Weekly overview plus a one-day zoom for device K.
df_k_copy = df_k_daily.copy()
df_k_copy["Month"] = df_k_copy.index.to_period('M')
# 24-hour slice used for the hourly view.
df_k_zoom = df_k.loc["2019-01-01 00:00:00":"2019-01-02 00:00:00"]
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(16, 7))
plt.subplots_adjust(hspace=0.4)
fig.suptitle('Device K', fontsize=20)
panels = [(df_k_weekly, 'black', 'Weekly Overview'),
          (df_k_zoom, '#717D7E', 'Hourly changes (24H overview)')]
for axis, (frame, shade, label) in zip(axes, panels):
    axis.plot(frame, color=shade)
    axis.set_title(label, fontsize=15, fontweight='bold')
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
The seasonal overview shows the spikes in temperature over time in a week's time frame, followed by hourly fluctuations. It can be seen that device K has some ups and downs in temperature, which is not a good indicator, but it is necessary to study more.
This will be a representation of the temperature view across the months in order to track the device's progress.
# Calendar heatmap of device K's temperature, one cell per day.
df_k_try = df_k.copy()
# Calendar features derived from the datetime index; reused by the
# per-month profile plots further below.
df_k_try["week"] = df_k_try.index.isocalendar().week
df_k_try["dayofmonth"] = df_k_try.index.day
df_k_try["month"] = df_k_try.index.month
fig, ax = calmap.calendarplot(df_k_try['Temp'],
                              fillcolor='lightgrey',
                              # dayticks=[0, 2, 4, 6] can zoom into particular days
                              cmap='BuGn',
                              fig_kws={"figsize": (16, 8)})
On the basis of the preceding representative, the day-by-day representation of temperature over time may be noticed. The device is mostly stable, with a few black patches that demonstrate some spiking and light grey representing the start and end of the dataset provided. This shows promising results, but it's also a good idea to take a closer look because that's where the next visualisation will appear.
# Adapted from: https://github.com/KishManani/MSTL/blob/main/mstl_decomposition.ipynb
# One subplot per month: each individual day drawn as a faint line, plus
# the month's mean daily temperature profile in solid blue.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=[15, 10], sharey=True)
fig.subplots_adjust(hspace=.5)
ax = ax.flatten()
sns_blue = sns.color_palette(as_cmap=True)[0]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May"]
for month_idx, month in enumerate(MONTHS):
    daily_ts = []
    # Rows belonging to this calendar month, grouped by day of month.
    month_rows = df_k_try[["Temp", "dayofmonth", "month"]].query(
        f"month == {month_idx + 1}")
    for _, ts in month_rows.groupby("dayofmonth"):
        day_series = ts.reset_index()["Temp"]
        daily_ts.append(day_series)
        day_series.plot(alpha=0.1, ax=ax[month_idx], color=sns_blue,
                        label="_no_legend_")
    ax[month_idx].set_title(month)
    # Mean profile across all days of the month.
    pd.concat(daily_ts, axis=1).mean(axis=1).plot(
        ax=ax[month_idx], color="blue", label="mean", legend=True)
    ax[month_idx].legend(loc="upper left", frameon=False)
fig.text(0.5, -0.02, "Hour of day", ha="center")
fig.text(-0.02, 0.5, "Temp", va="center", rotation="vertical")
fig.suptitle("Daily")
fig.delaxes(ax[-1])
fig.tight_layout()
As we can see from the plot below, device K's upper operating limit is -20 degrees, and its readings should remain below that line.
# Full 5-minute series for device K with its upper limit marked.
fig, axes = plt.subplots(1,1, figsize=(16,5))
plt.plot(df_k, color='#C0D6EA')
# Horizontal reference line at -20: readings above it exceed the limit.
plt.axhline(y=-20,color='g',linestyle='-',label='Upper Limit',linewidth=2)
plt.legend(prop={'size': 10})
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
As can be observed, there are a few spikes that exceed the device's upper limit, which is definitely an issue that has to be resolved. This is where the additional steps come in useful to allocate and resolve such outliers, as well as prepare the device K for further interaction.
Here we can check upon missing values of device K.
# Frame summary for device K: shows the non-null count of Temp readings.
df_k.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 315648 entries, 2018-11-18 16:40:00 to 2021-11-18 16:35:00 Freq: 5T Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 315589 non-null float64 dtypes: float64(1) memory usage: 4.8 MB
# Count the missing Temp readings for device K.
df_k.isna().sum()
Temp 59 dtype: int64
As it appears that device K has 59 missing values, we may use outliers analysis to determine where they are.
With that in mind, let's look at the outliers using a density plot and a box plot. Outliers are data points that are markedly different from the rest. On the boxplot, this will be clearly visible.
# Density and box plot of device K's temperature distribution.
plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
# FIX: sns.distplot is deprecated (removed in seaborn 0.14); histplot
# with stat="density" and a KDE overlay is the documented replacement.
sns.histplot(df_k['Temp'],
             color='#C0D6EA',
             edgecolor='black',
             stat='density',
             kde=True,
             line_kws={'linewidth': 3})
plt.margins(0.05)
plt.title('Density plot', fontsize=20, y=1.1)
plt.ylabel('Density', fontweight='bold', labelpad=20)
plt.xlabel('Temp', fontweight='bold', labelpad=20)
# Box plot: points beyond the whiskers are the outliers of interest.
plt.subplot(1, 2, 2)
sns.boxplot(x=df_k['Temp'], color='#C0D6EA')
plt.margins(0.05)
plt.title('Box plot', fontsize=20, y=1.1)
plt.xlabel('Temp', fontweight='bold', labelpad=20)
plt.show()
As can be seen for device K, there is a strong bias toward negative temperatures, which is understandable given that it is a freezer; nevertheless, it is also clear that it has a significant number of outliers, which will be resolved in a subsequent stage using IQR.
The Inter-Quartile Range proximity rule is abbreviated as IQR. Where Q1 and Q3 are the dataset's 25th and 75th percentiles, respectively, and IQR is the inter-quartile range, calculated as Q3 – Q1.
## IQR (Inter-Quartile Range) outlier detection
# Quartiles of the Temp distribution.
per25 = df_k['Temp'].quantile(0.25)
per75 = df_k['Temp'].quantile(0.75)
# Fences at 1.5 * IQR beyond each quartile.
IQR = per75 - per25
u_limit = per75 + 1.5 * IQR
l_limit = per25 - 1.5 * IQR
# BUG FIX: the original computed the upper-tail selection and then
# immediately overwrote it with the lower-tail selection, discarding the
# upper tail entirely; combine both tails with a single OR mask.
trimed_df_k = df_k[(df_k['Temp'] > u_limit) | (df_k['Temp'] < l_limit)]
trimed_df_k.sample(5)
| Temp | |
|---|---|
| EventDt | |
| 2020-09-16 02:50:00 | -29.52 |
| 2020-09-15 23:25:00 | -29.12 |
| 2020-09-13 16:10:00 | -29.61 |
| 2020-09-13 14:05:00 | -29.44 |
| 2020-09-16 10:00:00 | -29.94 |
As can be seen, the range is somewhat narrow, with the majority of the values being negative, which is an obvious observation. Let's look at the same graphs but with trimmed data to see the difference.
# Same density/box view, now restricted to the IQR-selected rows.
plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
# FIX: sns.distplot is deprecated (removed in seaborn 0.14); histplot
# with stat="density" and a KDE overlay is the documented replacement.
sns.histplot(trimed_df_k['Temp'],
             color='#C0D6EA',
             edgecolor='black',
             stat='density',
             kde=True,
             line_kws={'linewidth': 3})
plt.margins(0.05)
plt.title(' Trimmed Density plot', fontsize=20, y=1.1)
plt.ylabel('Density', fontweight='bold', labelpad=20)
plt.xlabel('Temp', fontweight='bold', labelpad=20)
plt.subplot(1, 2, 2)
sns.boxplot(x=trimed_df_k['Temp'], color='#C0D6EA')
plt.margins(0.05)
plt.title(' Trimmed Box plot', fontsize=20, y=1.1)
plt.xlabel('Temp', fontweight='bold', labelpad=20)
plt.show()
While the density plot has more consistency, the boxplot still has some outliers, and to eliminate them, it is useful to utilize the capping approach, which will help to eliminate more values.
# Resolve outliers by capping (winsorizing) at the IQR limits instead of
# dropping the rows.
cap_df_k = df_k.copy()
# FIX: Series.clip expresses the original nested numpy.where
# upper/lower capping directly and preserves NaNs identically.
cap_df_k['Temp'] = cap_df_k['Temp'].clip(lower=l_limit, upper=u_limit)
# Visualise the capped distribution.
plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
# FIX: sns.distplot is deprecated (removed in seaborn 0.14); histplot
# with stat="density" and a KDE overlay is the documented replacement.
sns.histplot(cap_df_k['Temp'],
             color='#C0D6EA',
             edgecolor='black',
             stat='density',
             kde=True,
             line_kws={'linewidth': 5})
plt.margins(0.05)
plt.title(' Capped Density plot', fontsize=20, y=1.1)
plt.ylabel('Density', fontweight='bold', labelpad=20)
plt.xlabel('Temp', fontweight='bold', labelpad=20)
plt.subplot(1, 2, 2)
sns.boxplot(x=cap_df_k['Temp'], color='#C0D6EA')
plt.margins(0.05)
plt.title(' Capped Box plot', fontsize=20, y=1.1)
plt.xlabel('Temp', fontweight='bold', labelpad=20)
plt.show()
The distribution of temperature values is clear now that the density and box plots have evolved, but in order to retain such limits in future iterations, the outliers must be analyzed by clustering and then removed from the dataset.
# Down-sample Device K to half-hour means and drop the intervals that were
# entirely empty, so the DBSCAN step below receives no NaNs.
df_k_copy = df_k.resample('30min').mean().dropna()
df_k_copy
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:30:00 | -26.327500 |
| 2018-11-18 17:00:00 | -27.076667 |
| 2018-11-18 17:30:00 | -26.381667 |
| 2018-11-18 18:00:00 | -26.666667 |
| 2018-11-18 18:30:00 | -27.086667 |
| ... | ... |
| 2021-11-18 14:30:00 | -27.211667 |
| 2021-11-18 15:00:00 | -26.870000 |
| 2021-11-18 15:30:00 | -26.795000 |
| 2021-11-18 16:00:00 | -27.011667 |
| 2021-11-18 16:30:00 | -26.655000 |
52609 rows × 1 columns
# DBSCAN flags sparse points as noise (-1): a reading needs at least
# min_samples neighbours within eps to join a cluster. Only the non-default
# arguments are spelled out; metric='euclidean', algorithm='auto',
# leaf_size=30 etc. were already the sklearn defaults.
outliers = DBSCAN(eps=0.5, min_samples=5)
df_k_copy['Cluster'] = outliers.fit_predict(df_k_copy)
df_k_copy
| Temp | Cluster | |
|---|---|---|
| EventDt | ||
| 2018-11-18 16:30:00 | -26.327500 | 0 |
| 2018-11-18 17:00:00 | -27.076667 | 0 |
| 2018-11-18 17:30:00 | -26.381667 | 0 |
| 2018-11-18 18:00:00 | -26.666667 | 0 |
| 2018-11-18 18:30:00 | -27.086667 | 0 |
| ... | ... | ... |
| 2021-11-18 14:30:00 | -27.211667 | 0 |
| 2021-11-18 15:00:00 | -26.870000 | 0 |
| 2021-11-18 15:30:00 | -26.795000 | 0 |
| 2021-11-18 16:00:00 | -27.011667 | 0 |
| 2021-11-18 16:30:00 | -26.655000 | 0 |
52609 rows × 2 columns
# Human-readable labels for the DBSCAN output: 0 = core cluster, -1 = noise.
cluster_labels = {0: "Normal data", -1: "Outlier"}
df_k_copy['ClusterName'] = df_k_copy['Cluster'].map(cluster_labels)
df_k_copy
| Temp | Cluster | ClusterName | |
|---|---|---|---|
| EventDt | |||
| 2018-11-18 16:30:00 | -26.327500 | 0 | Normal data |
| 2018-11-18 17:00:00 | -27.076667 | 0 | Normal data |
| 2018-11-18 17:30:00 | -26.381667 | 0 | Normal data |
| 2018-11-18 18:00:00 | -26.666667 | 0 | Normal data |
| 2018-11-18 18:30:00 | -27.086667 | 0 | Normal data |
| ... | ... | ... | ... |
| 2021-11-18 14:30:00 | -27.211667 | 0 | Normal data |
| 2021-11-18 15:00:00 | -26.870000 | 0 | Normal data |
| 2021-11-18 15:30:00 | -26.795000 | 0 | Normal data |
| 2021-11-18 16:00:00 | -27.011667 | 0 | Normal data |
| 2021-11-18 16:30:00 | -26.655000 | 0 | Normal data |
52609 rows × 3 columns
# Timeline of Device K coloured by the DBSCAN verdict (normal vs outlier).
plt.figure(figsize=(16,8))
sns.lineplot(data= df_k_copy, x= df_k_copy.index, y='Temp', hue = 'ClusterName')
<AxesSubplot:xlabel='EventDt', ylabel='Temp'>
df_k_out = df_k.dropna()
# Z score: absolute distance of each reading from the mean, measured in
# standard-deviation units.
z = np.abs(stats.zscore(df_k_out['Temp']))
print(z)
EventDt
2018-11-18 16:40:00 0.748278
2018-11-18 16:45:00 0.523413
2018-11-18 16:50:00 0.202178
2018-11-18 16:55:00 0.183305
2018-11-18 17:00:00 0.440293
...
2021-11-18 16:15:00 0.343922
2021-11-18 16:20:00 0.140473
2021-11-18 16:25:00 0.095099
2021-11-18 16:30:00 0.095099
2021-11-18 16:35:00 0.151181
Name: Temp, Length: 315589, dtype: float64
# Flag readings more than 3 standard deviations from the mean.
threshold =3
indexes = np.where(z > threshold)
# Positional indices of the outliers
print(indexes)
(array([ 77678, 77679, 77680, 77681, 77682, 77683, 77684, 77685,
77686, 77687, 77688, 77689, 77690, 93455, 93456, 93457,
93458, 93459, 93460, 93461, 93462, 93463, 93464, 93465,
93466, 93467, 93468, 93469, 93470, 93471, 93472, 93473,
93474, 93475, 93476, 93477, 93478, 93479, 93480, 93481,
93482, 93483, 93484, 93485, 93517, 93518, 93519, 93520,
93521, 93522, 93523, 93524, 93525, 93526, 93527, 93528,
93529, 93530, 93531, 93532, 93533, 93534, 93535, 93536,
93537, 93538, 93539, 93540, 93541, 93542, 93543, 93544,
93545, 93546, 93547, 93548, 93549, 93550, 93551, 93552,
93553, 93554, 99537, 99538, 99539, 99540, 126350, 126351,
126352, 144478, 144479, 144480, 144481, 144482, 144483, 144484,
144485, 144486, 144487, 144488, 144489, 144490, 144491, 144492,
144493, 144494, 144495, 144496, 144497, 144498, 144499, 144500,
144501, 151922, 151923, 151924, 151925, 151926, 151927, 151928,
151929, 151930, 151931, 151932, 151933, 151934, 151935, 151936,
151937, 151938, 151939, 151940, 151941, 151942, 151943, 151944,
151945, 151946, 151947, 151948, 151949, 151950, 151951, 151952,
151953, 151954, 151955, 151956, 151957, 151958, 151959, 151960,
151961, 151962, 151963, 151964, 151965, 151966, 151967, 151968,
151969, 151970, 151971, 151972, 151973, 151974, 151975, 151976,
151977, 151978, 162321, 162322, 162323, 162324, 162325, 162326,
162327, 162328, 191256, 191257, 191258, 191259, 191260, 191261,
191262, 191263, 191264, 191265, 191266, 191267, 191268, 191269,
191270, 191271, 191272, 191273, 191274, 191275, 191276, 191277,
191278, 191279, 191280, 191281, 191282, 191283, 191284, 191285,
191286, 191287, 191288, 191289, 191290, 191291, 191292, 191293,
191294, 191295, 191296, 191297, 191344, 191346, 191347, 191348,
191349, 191350, 191351, 191352, 191353, 191354, 191355, 191356,
191357, 191358, 191359, 191360, 191361, 191362, 191363, 191364,
191365, 191366, 191367, 191368, 191369, 191370, 191371, 191372,
191373, 191374, 191375, 191376, 191377, 191378, 191379, 191380,
191381, 191382, 191383, 191384, 191385, 191386, 191387, 191388,
191389, 191390, 191391, 191392, 191393, 191394, 191395, 191396,
191397, 191398, 191399, 191400, 191401, 191402, 191443, 191444,
191445, 191446, 191447, 191448, 191449, 191450, 191451, 191452,
191453, 191454, 191455, 191456, 191457, 191458, 191459, 191460,
191461, 191462, 191463, 191464, 191465, 191466, 191467, 191468,
191469, 191470, 191471, 191472, 191473, 191474, 191475, 191476,
191477, 191478, 191479, 191480, 191481, 191482, 191483, 191484,
191485, 191486, 191487, 191488, 191489, 191490, 191491, 191492,
191493, 191494, 191495, 191496, 191497, 191498, 191499, 191500,
191501, 191502, 191503, 191504, 191505, 191506, 191507, 191542,
191543, 191544, 191545, 191546, 191547, 191548, 191549, 191550,
191551, 191552, 191553, 191554, 191555, 191556, 191557, 191558,
191559, 191560, 191561, 191562, 191563, 191564, 191565, 191566,
191567, 191568, 191569, 191570, 191571, 191572, 191573, 191574,
191575, 191576, 191577, 191578, 191579, 191580, 191581, 191582,
191583, 191584, 191585, 191586, 191587, 191588, 191589, 191590,
191591, 191592, 191593, 191594, 191595, 191596, 191597, 191598,
191599, 191600, 191601, 191602, 191603, 191604, 191605, 191606,
191607, 191608, 191609, 191610, 191611, 191612, 191638, 191639,
191640, 191641, 191642, 191643, 191644, 191645, 191646, 191647,
191648, 191649, 191650, 191651, 191652, 191653, 191654, 191655,
191656, 191657, 191658, 191659, 191660, 191661, 191662, 191663,
191664, 191665, 191666, 191667, 191668, 191669, 191670, 191671,
191672, 191673, 191674, 191675, 191676, 191677, 191678, 191679,
191680, 191681, 191682, 191683, 191684, 191685, 191686, 191687,
191688, 191689, 191690, 191691, 191692, 191693, 191694, 191695,
191696, 191697, 191698, 191699, 191700, 191701, 191702, 191703,
191704, 191705, 191706, 191707, 191708, 191709, 191710, 191711,
191712, 191713, 191714, 191715, 191758, 191759, 191760, 191761,
191762, 191763, 191764, 191765, 191766, 191767, 191768, 191769,
191770, 191771, 191772, 191773, 191774, 191775, 191776, 191777,
191778, 191779, 191780, 191781, 191782, 191783, 191784, 191785,
191786, 191787, 191788, 191789, 191790, 191791, 191792, 191793,
191794, 191795, 191796, 191797, 191798, 191799, 191800, 191801,
191802, 191803, 191804, 191805, 191806, 191807, 191808, 191809,
191810, 191811, 191812, 191813, 191814, 191815, 191816, 191817,
191818, 191854, 191855, 191856, 191857, 191858, 191859, 191860,
191861, 191862, 191863, 191864, 191865, 191866, 191867, 191868,
191869, 191870, 191871, 191872, 191873, 191874, 191875, 191876,
191877, 191878, 191879, 191880, 191881, 191882, 191883, 191884,
191885, 191886, 191887, 191888, 191889, 191890, 191891, 191892,
191893, 191894, 191895, 191896, 191897, 191898, 191899, 191900,
191901, 191902, 191903, 191904, 191905, 191906, 191907, 191908,
191909, 191910, 191911, 191912, 191913, 191914, 191915, 191916,
191917, 191918, 191919, 191920, 191921, 191922, 191923, 191955,
191956, 191957, 191958, 191959, 191960, 191961, 191962, 191963,
191964, 191965, 191966, 191967, 191968, 191969, 191970, 191971,
191972, 191973, 191974, 191975, 191976, 191977, 191978, 191979,
191980, 191981, 191982, 191983, 191984, 191985, 191986, 191987,
191988, 191989, 191990, 191991, 191992, 191993, 191994, 191995,
191996, 191997, 191998, 191999, 192000, 192001, 192002, 192003,
192004, 192005, 192006, 192007, 192008, 192009, 192010, 192011,
192012, 192013, 192014, 192015, 192016, 192017, 192018, 192019,
192020, 192021, 192022, 192023, 192024, 192025, 192090, 192091,
192092, 192093, 192094, 192095, 192096, 192097, 192098, 192099,
192100, 192101, 192102, 192103, 192104, 192105, 192106, 192107,
192108, 192109, 192110, 192111, 192112, 192113, 192114, 192115,
192116, 192117, 192118, 192119, 192120, 192121, 192122, 192123,
192124, 192125, 192126, 192127, 192128, 192129, 192167, 192168,
192169, 192170, 192171, 192172, 192173, 192174, 192175, 192176,
192177, 192178, 192179, 192180, 192181, 192182, 192183, 192184,
192185, 192186, 192187, 192188, 192189, 192190, 192191, 192192,
192193, 192194, 192195, 192196, 192197, 192198, 192199, 192200,
192201, 192202, 192203, 192204, 192205, 192206, 192207, 192208,
192209, 192210, 192211, 192212, 192213, 192214, 192215, 192216,
192217, 192218, 192219, 192220, 192221, 192222, 192223, 192224,
192225, 192226, 192227, 192228, 192229, 192230, 192231, 192232,
192233, 192234, 192235, 192250, 192251, 192252, 192253, 192254,
192255, 192256, 192257, 192258, 192259, 192260, 192261, 192262,
192263, 192264, 192265, 192266, 192267, 192268, 192269, 192270,
192271, 192272, 192273, 192274, 192275, 192276, 196865, 196866,
196867, 196868, 196869, 196870, 196871, 196872, 196873, 196874,
256756, 256757, 256758, 256759, 256760, 256761, 256762, 256763,
256764, 256765, 300565, 300566, 300567, 300568, 300569, 300570,
300571, 300572, 300573, 300574, 300575, 300581, 300582, 300593,
300594, 300595, 306858, 306859, 306860, 306861, 306862, 306863,
306864, 306865, 306866], dtype=int64),)
# Remove the z-score outliers. Reassigning the result (instead of
# inplace=True on what pandas may treat as a view of df_k) avoids the
# SettingWithCopyWarning that the original run emitted here.
df_k_out = df_k_out.drop(df_k_out.index[indexes])
df_k_out
WARNING - (py.warnings._showwarnmsg) - C:\Users\shane\anaconda3\lib\site-packages\pandas\core\frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:40:00 | -25.93 |
| 2018-11-18 16:45:00 | -26.14 |
| 2018-11-18 16:50:00 | -26.44 |
| 2018-11-18 16:55:00 | -26.80 |
| 2018-11-18 17:00:00 | -27.04 |
| ... | ... |
| 2021-11-18 16:15:00 | -26.95 |
| 2021-11-18 16:20:00 | -26.76 |
| 2021-11-18 16:25:00 | -26.54 |
| 2021-11-18 16:30:00 | -26.54 |
| 2021-11-18 16:35:00 | -26.77 |
314714 rows × 1 columns
# Device K after z-score outlier removal, with the domain upper limit overlaid.
fig, axes = plt.subplots(1,1, figsize=(16,5))
plt.plot(df_k_out, color='#C0D6EA')
plt.axhline(y=-20,color='g',linestyle='-',label='Upper Limit',linewidth=2)
plt.legend(prop={'size': 10})
# NOTE: plt.rc changes tick sizes globally, affecting all later figures too.
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
Before going deeper into the device's initial analysis, it's a good idea to check for any missing data first, it comes down to similar steps as the previous device.
df_s.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 315647 entries, 2018-11-18 16:50:00 to 2021-11-18 16:40:00 Freq: 5T Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 315640 non-null float64 dtypes: float64(1) memory usage: 12.9 MB
df_s.isna().sum()
Temp 7 dtype: int64
As can be seen in this analysis, device S contains only 7 missing values out of a total of 315,648 records, which is much lower compared to the previous device.
From the plot below, we can see that on the first plot is showing the outliers from device S in each month, the second plot is showing the overview for the entire week, and lastly the third plot is showing us the overview for the whole day (24 Hours).
# Three-panel overview for Device S: monthly boxplots (outliers), weekly
# resample, and a single-day (24h) zoom.
df_s_copy = df_s_daily.copy()
df_s_copy["Month"] = df_s_copy.index.to_period('M')
# BUG FIX: the 24-hour zoom previously sliced df_k (Device K); this section
# analyses Device S, so slice df_s instead.
df_s_zoom = df_s.loc["2019-01-01 00:00:00":"2019-01-02 00:00:00"]
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(16,14))
plt.subplots_adjust(hspace=0.4)
sns.boxplot(ax=axes[0],x="Month",y="Temp",data=df_s_copy,palette = "rocket")
fig.suptitle('Device S',fontsize="xx-large")
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation = 45)
axes[1].plot(df_s_weekly,color='black' )
axes[2].plot(df_s_zoom,color='#717D7E')
axes[0].set_title('Showing the Outliers', fontsize=15,fontweight='bold')
axes[1].set_title('Weekly Overview', fontsize=15,fontweight='bold')
axes[2].set_title('Hourly changes (24H overview)', fontsize=15,fontweight='bold')
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
Upper and lower limits are taken from another dataset; for device S, the lower limit is 0 and the upper limit is 4 (the 25/8 limits belong to device I); to show this, as well as the device's consistency through time, see the visual below.
# Device S raw series with its domain limits (0 lower, 4 upper) overlaid.
fig, axes = plt.subplots(1,1, figsize=(16,5))
fig.suptitle('Upper and Lower Limits for Device S',fontsize=15)
plt.plot(df_s,color="#abcae4")
plt.axhline(y=0,color='r',linestyle='-',label='Lower Limit',linewidth=3) # limits taken from another dataset
plt.axhline(y=4,color='g',linestyle='-',label='Upper Limit',linewidth=3)
plt.legend(prop={'size': 10})
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
From the plot below we can see the actual dataset, the trend of the data, whether there is seasonality in the data, and lastly whether there is any residual in our data.
# Additive seasonal decomposition of Device S (observed/trend/seasonal/resid).
# NOTE(review): period=1 models no seasonality at all (seasonal component is
# flat); for 5-minute data a daily cycle would be period=288 — confirm intent.
result = seasonal_decompose(df_s.dropna(), model='additive', period=1)
fig = result.plot()
fig.suptitle('Seasonal Decomposition for Device S',fontsize=18)
fig.set_size_inches((16, 9))
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
# Tight layout to realign subplots under the suptitle
fig.tight_layout()
plt.show()
# Density plot (left) and box plot (right) of the raw Device S temperatures.
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(df_s['Temp'])
plt.margins(0.05)
# Titles and axis labels
plt.title('Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# Box plot
plt.subplot(1,2,2)
sns.boxplot(df_s['Temp'])
plt.margins(0.05)
plt.title('Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
With that in mind, let's look at the outliers using a density plot and a box plot. Outliers are data points that are markedly different from the rest. On the boxplot, this will be clearly visible.
# IQR method for Device S
# Quartiles of the temperature distribution
perc25_s = df_s['Temp'].quantile(0.25)
perc75_s = df_s['Temp'].quantile(0.75)
# Tukey fences: 1.5 * IQR beyond each quartile
IQR_s = perc75_s - perc25_s
upper_limit_s = perc75_s + 1.5 * IQR_s
lower_limit_s = perc25_s - 1.5 * IQR_s
# Keep only the high-side outliers; the note below records that the series
# minimum already sits above the lower fence, so the low-side filter is
# deliberately skipped here.
trimed_df_s=df_s[df_s['Temp'] > upper_limit_s]
#df_s[df_s['Temp'] < lower_limit_s] nothing lower than already the lowest point maybe change the method but will see
trimed_df_s.sample(5)
| Temp | |
|---|---|
| EventDt | |
| 2019-09-21 18:10:00 | 5.30 |
| 2019-09-09 07:45:00 | 5.49 |
| 2019-09-09 03:45:00 | 4.45 |
| 2019-09-25 12:25:00 | 3.79 |
| 2021-08-12 10:50:00 | 3.97 |
Based on the samples, the approximate range is in the 3 to 6 range. It is also useful to visualize further to compare the findings of the broad visualisation above.
# Density plot (left) and box plot (right) of the trimmed Device S data.
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(trimed_df_s['Temp'])
plt.margins(0.05)
# Titles and axis labels
plt.title(' Trimmed Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# Box plot
plt.subplot(1,2,2)
sns.boxplot(trimed_df_s['Temp'])
plt.margins(0.05)
plt.title(' Trimmed Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
The trimmed data displays a consistent density plot with some suspicious spikes in the middle; however, the box plot is clean and within the outlier limits. Let's check the data to see if anything is still missing.
# NOTE(review): the recorded output shows a NameError for trimed_df_s — this
# cell was executed before the cell that defines it; re-run the notebook in order.
trimed_df_s.isnull().sum()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_18696/2302863753.py in <module> ----> 1 trimed_df_s.isnull().sum() NameError: name 'trimed_df_s' is not defined
# Hourly totals for Device S. NOTE(review): the other devices use .mean()
# here; also, .sum() of an all-NaN hour yields 0.0, so the dropna() below
# removes nothing — confirm whether .mean() was intended.
df_s_resample = df_s.resample('H').sum()
df_s_resample = df_s_resample.dropna()
df_s_resample
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:00:00 | 2.47 |
| 2018-11-18 17:00:00 | 13.70 |
| 2018-11-18 18:00:00 | 13.84 |
| 2018-11-18 19:00:00 | 17.38 |
| 2018-11-18 20:00:00 | 13.92 |
| ... | ... |
| 2021-11-18 12:00:00 | 10.58 |
| 2021-11-18 13:00:00 | 9.45 |
| 2021-11-18 14:00:00 | 6.82 |
| 2021-11-18 15:00:00 | 5.12 |
| 2021-11-18 16:00:00 | 2.66 |
26305 rows × 1 columns
# DBSCAN on the hourly-resampled Device S series: sparse points become noise
# (-1). Only the non-default arguments are kept; the original also spelled
# out metric='euclidean', algorithm='auto', leaf_size=30 etc., which are the
# sklearn defaults.
outliers = DBSCAN(eps=0.5, min_samples=5)
df_s_resample['Cluster'] = outliers.fit_predict(df_s_resample)
df_s_resample
| Temp | Cluster | |
|---|---|---|
| EventDt | ||
| 2018-11-18 16:00:00 | 2.47 | 0 |
| 2018-11-18 17:00:00 | 13.70 | 0 |
| 2018-11-18 18:00:00 | 13.84 | 0 |
| 2018-11-18 19:00:00 | 17.38 | 0 |
| 2018-11-18 20:00:00 | 13.92 | 0 |
| ... | ... | ... |
| 2021-11-18 12:00:00 | 10.58 | 0 |
| 2021-11-18 13:00:00 | 9.45 | 0 |
| 2021-11-18 14:00:00 | 6.82 | 0 |
| 2021-11-18 15:00:00 | 5.12 | 0 |
| 2021-11-18 16:00:00 | 2.66 | 0 |
26305 rows × 2 columns
# Human-readable labels for the DBSCAN output: 0 = core cluster, -1 = noise.
cluster_labels = {0: "Normal data", -1: "Outlier"}
df_s_resample['ClusterName'] = df_s_resample['Cluster'].map(cluster_labels)
df_s_resample
| Temp | Cluster | ClusterName | |
|---|---|---|---|
| EventDt | |||
| 2018-11-18 16:00:00 | 2.47 | 0 | Normal data |
| 2018-11-18 17:00:00 | 13.70 | 0 | Normal data |
| 2018-11-18 18:00:00 | 13.84 | 0 | Normal data |
| 2018-11-18 19:00:00 | 17.38 | 0 | Normal data |
| 2018-11-18 20:00:00 | 13.92 | 0 | Normal data |
| ... | ... | ... | ... |
| 2021-11-18 12:00:00 | 10.58 | 0 | Normal data |
| 2021-11-18 13:00:00 | 9.45 | 0 | Normal data |
| 2021-11-18 14:00:00 | 6.82 | 0 | Normal data |
| 2021-11-18 15:00:00 | 5.12 | 0 | Normal data |
| 2021-11-18 16:00:00 | 2.66 | 0 | Normal data |
26305 rows × 3 columns
# Timeline of Device S coloured by the DBSCAN verdict.
plt.figure(figsize=(16,8))
sns.lineplot(data= df_s_resample, x= df_s_resample.index, y='Temp', hue = 'ClusterName')
<AxesSubplot:xlabel='EventDt', ylabel='Temp'>
df_s_copy = df_s.dropna()
# Z score: absolute distance from the mean in standard-deviation units.
# NOTE: this rebinds df_s_copy, discarding the monthly-boxplot copy made earlier.
z = np.abs(stats.zscore(df_s_copy['Temp']))
print(z)
EventDt
2018-11-18 16:50:00 0.199490
2018-11-18 16:55:00 0.186415
2018-11-18 17:00:00 0.238718
2018-11-18 17:05:00 0.264869
2018-11-18 17:10:00 0.277945
...
2021-11-18 16:20:00 1.441689
2021-11-18 16:25:00 1.454764
2021-11-18 16:30:00 1.454764
2021-11-18 16:35:00 1.493992
2021-11-18 16:40:00 1.454764
Name: Temp, Length: 315640, dtype: float64
# Flag readings more than 3 standard deviations from the mean.
threshold =3
indexes = np.where(z > threshold)
# Positional indices of the outliers
print(indexes)
(array([ 81181, 81185, 81730, 81731, 81732, 81733, 81734, 81735,
81736, 81737, 81738, 81739, 81740, 81741, 81742, 81743,
84759, 84760, 84761, 84762, 84763, 84764, 84765, 84766,
84767, 84768, 84769, 84770, 84771, 84772, 84773, 84774,
84775, 84776, 84777, 84778, 84779, 84780, 84781, 84782,
84783, 84784, 84785, 84786, 84787, 84788, 84789, 84790,
84791, 84792, 84793, 84794, 84795, 84796, 84797, 84798,
84799, 84800, 84801, 84802, 84803, 84804, 84805, 84806,
84807, 84808, 84809, 84810, 84811, 84812, 84813, 84814,
84815, 84816, 84817, 84818, 84819, 84820, 84821, 84822,
84823, 84824, 84825, 84826, 84827, 84828, 84829, 84830,
84831, 84832, 84833, 84834, 84835, 84836, 84837, 84838,
84839, 84840, 84841, 84842, 84843, 84844, 84845, 84846,
84847, 84848, 84849, 84850, 84851, 84852, 84853, 84854,
84855, 84856, 84857, 84858, 84859, 84860, 84861, 84862,
84863, 84864, 84865, 84866, 84867, 84868, 84869, 84870,
84871, 84872, 84873, 84874, 84875, 84876, 84877, 84878,
84879, 84880, 84881, 84882, 84883, 84884, 88360, 88361,
88362, 88363, 88364, 88365, 88366, 88367, 88368, 88369,
88370, 88371, 88372, 88373, 88374, 88375, 88376, 88377,
88378, 88379, 88380, 88381, 88382, 88383, 88384, 88385,
88386, 88387, 88388, 88389, 88390, 88391, 88392, 88393,
88394, 88395, 88396, 88397, 88398, 88399, 88400, 88401,
88402, 88403, 88404, 88405, 88406, 88407, 88408, 88409,
88410, 88411, 88412, 88413, 88414, 88415, 88416, 88417,
88418, 88419, 88420, 88421, 88422, 88423, 88424, 88425,
88426, 88427, 88428, 88429, 88430, 88431, 88432, 88433,
88434, 88435, 88436, 88437, 88438, 88439, 88440, 88441,
88442, 88443, 88444, 88445, 88446, 88447, 88448, 88449,
88450, 88451, 88452, 88453, 88454, 88455, 88456, 88457,
88458, 88459, 88460, 88461, 88462, 88463, 88464, 88465,
88466, 88467, 88468, 88469, 88470, 88471, 88472, 88473,
88474, 88475, 88476, 88477, 88478, 88479, 88480, 88481,
88482, 88483, 88484, 88485, 88486, 88487, 88488, 88489,
88490, 88491, 88492, 88493, 89504, 89505, 89506, 89512,
89513, 89514, 89515, 89516, 89517, 89518, 89519, 89520,
89521, 89522, 89523, 89524, 166394, 166395, 166396, 166397,
166398, 166399, 166400, 166401, 166402, 171046, 171047, 171048,
171049, 171050, 171051, 171052, 171053, 171054, 171055, 171056,
178538, 287340, 287341, 287342, 287343, 287344, 287345, 287346,
287347, 287348, 287349, 287350, 287351, 289368, 289369, 289370,
289371, 289372, 289373, 289374, 289375, 293405, 293406, 293407,
293408, 293409, 293410, 293411, 293412, 293413, 293414, 293415,
293416, 293417, 293418, 293419, 293420, 293421, 293422, 293423,
293424, 293425, 293426, 293427, 293428, 293429, 293430, 293431,
293432, 293433, 293434, 293443, 293444, 293445, 293446, 293447,
293448, 293449], dtype=int64),)
# Remove the z-score outliers. Reassigning the result (instead of
# inplace=True on what pandas may treat as a view of df_s) avoids the
# SettingWithCopyWarning that the original run emitted here.
df_s_copy = df_s_copy.drop(df_s_copy.index[indexes])
df_s_copy
WARNING - (py.warnings._showwarnmsg) - C:\Users\shane\anaconda3\lib\site-packages\pandas\core\frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:50:00 | 1.23 |
| 2018-11-18 16:55:00 | 1.24 |
| 2018-11-18 17:00:00 | 1.20 |
| 2018-11-18 17:05:00 | 1.18 |
| 2018-11-18 17:10:00 | 1.17 |
| ... | ... |
| 2021-11-18 16:20:00 | 0.28 |
| 2021-11-18 16:25:00 | 0.27 |
| 2021-11-18 16:30:00 | 0.27 |
| 2021-11-18 16:35:00 | 0.24 |
| 2021-11-18 16:40:00 | 0.27 |
315270 rows × 1 columns
# Device S after z-score outlier removal, with its domain limits overlaid.
fig, axes = plt.subplots(1,1, figsize=(16,5))
plt.plot(df_s_copy, color='#ca5c5d')
plt.axhline(y=0,color='r',linestyle='-',label='Lower Limit',linewidth=3) # limits taken from another dataset
plt.axhline(y=4,color='g',linestyle='-',label='Upper Limit',linewidth=3)
plt.legend(prop={'size': 20})
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
df_i.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 315648 entries, 2018-11-18 16:35:00 to 2021-11-18 16:30:00 Freq: 5T Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 278713 non-null float64 dtypes: float64(1) memory usage: 12.9 MB
df_i.isna().sum()
Temp 36935 dtype: int64
sns.heatmap(df_i.isnull(), cbar=False)
<AxesSubplot:ylabel='EventDt'>
From the plot below, we can see that on the first plot is showing the outliers from device I in each month, the second plot is showing the overview for the entire week, and lastly the third plot is showing us the overview for the whole day (24 Hours).
# Three-panel overview for Device I: monthly boxplots (outliers), weekly
# resample, and a single-day (24h) zoom.
df_i_copy = df_i_daily.copy()
df_i_copy["Month"] = df_i_copy.index.to_period('M')
df_i_zoom = df_i.loc["2019-01-01 00:00:00":"2019-01-02 00:00:00"]
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(16,14))
plt.subplots_adjust(hspace=0.4)
sns.boxplot(ax=axes[0],x="Month",y="Temp",data=df_i_copy,palette = "rocket")
fig.suptitle('Device I',fontsize="xx-large")
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation = 45)
axes[1].plot(df_i_weekly,color='black' )
axes[2].plot(df_i_zoom,color='#717D7E')
axes[0].set_title('Showing the Outliers', fontsize=15,fontweight='bold')
axes[1].set_title('Weekly Overview', fontsize=15,fontweight='bold')
axes[2].set_title('Hourly changes (24H overview)', fontsize=15,fontweight='bold')
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.show()
Upper and lower limits are taken from another dataset; for example, for device I, the upper limit is 25 and the bottom limit is 8; to show this, as well as the device's consistency through time, see the visual below.
# Device I raw series with its domain limits (8 lower, 25 upper) overlaid.
fig, axes = plt.subplots(1,1, figsize=(16,5))
plt.plot(df_i, color='#86b2d8')
plt.axhline(y=8,color='r',linestyle='-',label='Lower Limit',linewidth=2)
plt.axhline(y=25,color='g',linestyle='-',label='Upper Limit',linewidth=2)
plt.legend(prop={'size': 10})
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
From the plot below we can see the actual dataset, the trend of the data, whether there is seasonality in the data, and lastly whether there is any residual in our data.
# Additive seasonal decomposition of Device I.
# NOTE(review): period=1 models no seasonality at all (seasonal component is
# flat); for 5-minute data a daily cycle would be period=288 — confirm intent.
result = seasonal_decompose(df_i.dropna(), model='additive', period=1)
fig = result.plot()
fig.set_size_inches((16, 9))
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# Tight layout to realign things
fig.tight_layout()
plt.show()
# Density plot (left) and box plot (right) of the raw Device I temperatures.
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(df_i['Temp'])
plt.margins(0.05)
# Titles and axis labels
plt.title('Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# Box plot
plt.subplot(1,2,2)
sns.boxplot(df_i['Temp'])
plt.margins(0.05)
plt.title('Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
According to the density plot, the variable is responsible for two large spikes in the distribution, and the box plot has outliers on both sides; to clarify, the IQR method will be used.
# IQR method for Device I
# Quartiles of the temperature distribution
per25_i = df_i['Temp'].quantile(0.25)
per75_i = df_i['Temp'].quantile(0.75)
# Tukey fences: 1.5 * IQR beyond each quartile
IQR_i=per75_i - per25_i
u_limit_i = per75_i + 1.5 * IQR_i
l_limit_i = per25_i - 1.5 * IQR_i
# BUG FIX: both filter results were previously discarded, so trimed_df_i was
# never defined (the old "#figure out how to store both" note). Store the
# outliers from both tails in one frame with a logical OR.
trimed_df_i = df_i[(df_i['Temp'] > u_limit_i) | (df_i['Temp'] < l_limit_i)]
#check results
trimed_df_i.sample(5)
| Temp | |
|---|---|
| EventDt | |
| 2020-06-01 17:10:00 | 29.95 |
| 2020-06-01 15:40:00 | 29.96 |
| 2020-06-01 16:45:00 | 30.47 |
| 2020-06-01 16:10:00 | 30.30 |
| 2020-06-01 15:55:00 | 30.15 |
It's useful to compare the same images with a trimmed dataset after elimination.
# Density plot (left) and box plot (right) of the trimmed Device I data.
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(trimed_df_i['Temp'])
plt.margins(0.05)
# Titles and axis labels
plt.title(' Trimmed Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# Box plot
plt.subplot(1,2,2)
sns.boxplot(trimed_df_i['Temp'])
plt.margins(0.05)
plt.title(' Trimmed Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
After trimming there is a clear concentration at the upper limit of the data; to eliminate it, and also to investigate the lower limit, it is useful to apply capping and compare the visuals in the subsequent phases.
# Capping (winsorizing): instead of dropping outliers, pin every reading that
# falls outside [l_limit_i, u_limit_i] to the nearest boundary value.
cap_df_i = df_i.copy()
# Series.clip is the idiomatic one-liner equivalent of the original nested
# numpy.where(upper, ..., numpy.where(lower, ...)) construction.
cap_df_i['Temp'] = cap_df_i['Temp'].clip(lower=l_limit_i, upper=u_limit_i)
# Visualise the capped series: density plot (left) and box plot (right).
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(cap_df_i['Temp'])
plt.margins(0.05)
# Titles and axis labels
plt.title(' Capped Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# Box plot
plt.subplot(1,2,2)
sns.boxplot(cap_df_i ['Temp'])
plt.margins(0.05)
plt.title(' Capped Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
After capping, it is evident at the boundaries that the outliers have been eliminated and the density of the variable is distributed in a range of 15 to 20. Before continuing, let's check for missing values.
trimed_df_i.isnull().sum()
Temp 0 dtype: int64
# Down-sample Device I to 10-minute means and drop the intervals that were
# entirely empty, so the DBSCAN step below receives no NaNs.
df_i_copy = df_i.resample('10min').mean().dropna()
df_i_copy
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:30:00 | 14.600 |
| 2018-11-18 16:40:00 | 14.615 |
| 2018-11-18 16:50:00 | 14.580 |
| 2018-11-18 17:00:00 | 14.555 |
| 2018-11-18 17:10:00 | 14.530 |
| ... | ... |
| 2021-11-18 15:50:00 | 14.660 |
| 2021-11-18 16:00:00 | 14.645 |
| 2021-11-18 16:10:00 | 14.655 |
| 2021-11-18 16:20:00 | 14.645 |
| 2021-11-18 16:30:00 | 14.640 |
139366 rows × 1 columns
# DBSCAN on the 10-minute Device I series: sparse points become noise (-1).
# Only the non-default arguments are kept; metric='euclidean',
# algorithm='auto', leaf_size=30 etc. were already the sklearn defaults.
outliers = DBSCAN(eps=0.5, min_samples=5)
df_i_copy['Cluster'] = outliers.fit_predict(df_i_copy)
df_i_copy
| Temp | Cluster | |
|---|---|---|
| EventDt | ||
| 2018-11-18 16:30:00 | 14.600 | 0 |
| 2018-11-18 16:40:00 | 14.615 | 0 |
| 2018-11-18 16:50:00 | 14.580 | 0 |
| 2018-11-18 17:00:00 | 14.555 | 0 |
| 2018-11-18 17:10:00 | 14.530 | 0 |
| ... | ... | ... |
| 2021-11-18 15:50:00 | 14.660 | 0 |
| 2021-11-18 16:00:00 | 14.645 | 0 |
| 2021-11-18 16:10:00 | 14.655 | 0 |
| 2021-11-18 16:20:00 | 14.645 | 0 |
| 2021-11-18 16:30:00 | 14.640 | 0 |
139366 rows × 2 columns
# Human-readable labels for the DBSCAN output: 0 = core cluster, -1 = noise.
cluster_labels = {0: "Normal data", -1: "Outlier"}
df_i_copy['ClusterName'] = df_i_copy['Cluster'].map(cluster_labels)
df_i_copy
| Temp | Cluster | ClusterName | |
|---|---|---|---|
| EventDt | |||
| 2018-11-18 16:30:00 | 14.600 | 0 | Normal data |
| 2018-11-18 16:40:00 | 14.615 | 0 | Normal data |
| 2018-11-18 16:50:00 | 14.580 | 0 | Normal data |
| 2018-11-18 17:00:00 | 14.555 | 0 | Normal data |
| 2018-11-18 17:10:00 | 14.530 | 0 | Normal data |
| ... | ... | ... | ... |
| 2021-11-18 15:50:00 | 14.660 | 0 | Normal data |
| 2021-11-18 16:00:00 | 14.645 | 0 | Normal data |
| 2021-11-18 16:10:00 | 14.655 | 0 | Normal data |
| 2021-11-18 16:20:00 | 14.645 | 0 | Normal data |
| 2021-11-18 16:30:00 | 14.640 | 0 | Normal data |
139366 rows × 3 columns
# Timeline of Device I coloured by the DBSCAN verdict.
plt.figure(figsize=(16,8))
sns.lineplot(data= df_i_copy, x= df_i_copy.index, y='Temp', hue = 'ClusterName')
<AxesSubplot:xlabel='EventDt', ylabel='Temp'>
df_i_rmv = df_i.dropna()
# Z score: absolute distance from the mean in standard-deviation units.
z = np.abs(stats.zscore(df_i_rmv['Temp']))
print(z)
EventDt
2018-11-18 16:35:00 0.827247
2018-11-18 16:40:00 0.818432
2018-11-18 16:45:00 0.827247
2018-11-18 16:50:00 0.836063
2018-11-18 16:55:00 0.830186
...
2021-11-18 16:10:00 0.812555
2021-11-18 16:15:00 0.809617
2021-11-18 16:20:00 0.815494
2021-11-18 16:25:00 0.812555
2021-11-18 16:30:00 0.815494
Name: Temp, Length: 278713, dtype: float64
# Flag readings more than 3 standard deviations from the mean.
threshold =3
indexes = np.where(z > threshold)
# Positional indices of the outliers
print(indexes)
(array([157329, 157330, 157331, 157332, 157333, 157334, 157335, 157336,
157337, 157338, 157339, 157340, 157341, 157342, 157343, 157344,
157345, 157346, 157347, 157348, 157349, 157350, 157351, 157352,
157353, 157354, 157355, 157356, 157357, 157358, 157359, 157360,
157361, 157362, 157363, 157364, 157365, 157366, 157367, 157368,
157369, 157874, 157875, 157876, 157877, 157878, 157879, 157880,
157881, 157882, 157883, 157884, 157885, 157886, 157887, 157888,
157889, 157891, 157892, 157893, 157894, 157895, 157896, 157897,
157898, 157899, 157900, 157901, 157902, 157903, 157904, 157905,
157906, 157907, 157908, 157909, 256499, 256500, 256501, 256502,
256503, 256504, 256505, 256506, 256507, 256508, 256509, 256510,
256511, 256513, 256514, 256515, 256516, 256517, 256518, 256519,
256520, 256521, 256522], dtype=int64),)
# Remove the z-score outliers. Reassigning the result (instead of
# inplace=True on what pandas may treat as a view of df_i) avoids the
# SettingWithCopyWarning that the original run emitted here.
df_i_rmv = df_i_rmv.drop(df_i_rmv.index[indexes])
df_i_rmv
WARNING - (py.warnings._showwarnmsg) - C:\Users\shane\anaconda3\lib\site-packages\pandas\core\frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:35:00 | 14.60 |
| 2018-11-18 16:40:00 | 14.63 |
| 2018-11-18 16:45:00 | 14.60 |
| 2018-11-18 16:50:00 | 14.57 |
| 2018-11-18 16:55:00 | 14.59 |
| ... | ... |
| 2021-11-18 16:10:00 | 14.65 |
| 2021-11-18 16:15:00 | 14.66 |
| 2021-11-18 16:20:00 | 14.64 |
| 2021-11-18 16:25:00 | 14.65 |
| 2021-11-18 16:30:00 | 14.64 |
278614 rows × 1 columns
# Device I after z-score outlier removal, with its domain limits overlaid.
fig, axes = plt.subplots(1,1, figsize=(16,5))
plt.plot(df_i_rmv, color='#ca5c5d')
plt.axhline(y=8,color='r',linestyle='-',label='Lower Limit',linewidth=2)
plt.axhline(y=25,color='g',linestyle='-',label='Upper Limit',linewidth=2)
plt.legend(prop={'size': 20})
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
df_d.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 315648 entries, 2018-11-18 16:15:00 to 2021-11-18 16:10:00 Freq: 5T Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 313427 non-null float64 dtypes: float64(1) memory usage: 12.9 MB
df_d.isna().sum()
Temp 2221 dtype: int64
sns.heatmap(df_d.isnull(), cbar=False)
<AxesSubplot:ylabel='EventDt'>
# Three-panel overview for Device D: monthly boxplots (outliers), weekly
# resample, and a single-day (24h) zoom.
df_d_copy = df_d_daily.copy()
df_d_copy["Month"] = df_d_copy.index.to_period('M')
df_d_zoom = df_d.loc["2019-01-01 00:00:00":"2019-01-02 00:00:00"]
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(16,14))
plt.subplots_adjust(hspace=0.4)
sns.boxplot(ax=axes[0],x="Month",y="Temp",data=df_d_copy,palette = "rocket")
fig.suptitle('Device D',fontsize="xx-large")
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation = 45)
axes[1].plot(df_d_weekly,color='black' )
axes[2].plot(df_d_zoom,color='#717D7E')
axes[0].set_title('Showing the Outliers', fontsize=15,fontweight='bold')
axes[1].set_title('Weekly Overview', fontsize=15,fontweight='bold')
axes[2].set_title('Hourly changes (24H overview)', fontsize=15,fontweight='bold')
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
plt.show()
From the plot below we can see the actual dataset, the trend of the data, whether there is seasonality in the data, and lastly whether there is any residual in our data.
# Additive seasonal decomposition of Device D.
# NOTE(review): period=1 models no seasonality at all (seasonal component is
# flat); for 5-minute data a daily cycle would be period=288 — confirm intent.
result = seasonal_decompose(df_d.dropna(), model='additive', period=1)
fig = result.plot()
fig.suptitle('Seasonal Decomposition for Device D',fontsize=18)
fig.set_size_inches((16, 7))
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# Tight layout to realign subplots under the suptitle
fig.tight_layout()
plt.show()
# density plot (left panel)
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
# NOTE: sns.distplot is deprecated in recent seaborn (histplot/displot replace it)
sns.distplot(df_d['Temp'])
plt.margins(0.05)
# titles / axis labels
plt.title('Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# box plot (right panel)
plt.subplot(1,2,2)
sns.boxplot(df_d['Temp'])
plt.margins(0.05)
# titles / axis labels
plt.title('Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
The density plot shows a dense distribution, but the box plot makes it clear that this device has many outliers; the next step is to evaluate the limits so they can be bounded in future.
# IQR method: compute Tukey fences (quartiles +/- 1.5*IQR) and flag the
# readings that fall outside them.
per25_d = df_d['Temp'].quantile(0.25)  # lower quartile
per75_d = df_d['Temp'].quantile(0.75)  # upper quartile
# Finding the upper and lower limits
IQR_d = per75_d - per25_d
u_limit_d = per75_d + 1.5 * IQR_d
l_limit_d = per25_d - 1.5 * IQR_d
# Finding the outliers.
# NOTE: despite its name, trimed_df_d holds the HIGH outliers (rows above the
# upper fence), not the trimmed series — kept as-is because later cells use it.
trimed_df_d = df_d[df_d['Temp'] > u_limit_d]
# The original computed the low-outlier mask here but discarded the result
# (dead expression); capture it so it can actually be inspected.
low_outliers_d = df_d[df_d['Temp'] < l_limit_d]
# check results
trimed_df_d.sample(5)
| Temp | |
|---|---|
| EventDt | |
| 2021-07-19 14:00:00 | 29.62 |
| 2019-07-23 17:55:00 | 30.07 |
| 2019-07-23 12:10:00 | 29.86 |
| 2021-07-22 20:05:00 | 29.34 |
| 2019-08-02 12:55:00 | 28.74 |
After trimming, it is important to visualise the subset above the upper limit to see the difference from the previous phase before proceeding.
# density plot of the high-outlier subset (left panel)
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(trimed_df_d['Temp'])
plt.margins(0.05)
# titles / axis labels
plt.title(' Trimmed Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# box plot (right panel)
plt.subplot(1,2,2)
sns.boxplot(trimed_df_d['Temp'])
plt.margins(0.05)
# titles / axis labels
plt.title(' Trimmed Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
Although there are still some outliers to account for, the density is clearly concentrated towards the left and tapers off on the right; capping is useful because it also handles the lower limit.
# An alternative to trimming: cap the outliers at the IQR fences. Clipping to
# [l_limit_d, u_limit_d] is equivalent to the nested numpy.where of the
# original — values above the upper fence become u_limit_d, values below the
# lower fence become l_limit_d, and everything else (including NaN) is kept.
cap_df_d = df_d.copy()
cap_df_d['Temp'] = cap_df_d['Temp'].clip(lower=l_limit_d, upper=u_limit_d)
# visualise the capping as a final result
# density plot (left panel)
plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(cap_df_d['Temp'])
plt.margins(0.05)
# titles / axis labels
plt.title(' Capped Density plot',fontsize=20,y=1.1)
plt.ylabel('Density',fontweight='bold',labelpad=20)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
# box plot (right panel)
plt.subplot(1,2,2)
sns.boxplot(cap_df_d ['Temp'])
plt.margins(0.05)
# titles / axis labels
plt.title(' Capped Box plot',fontsize=20,y=1.1)
plt.xlabel('Temp',fontweight='bold',labelpad=20)
plt.show()
The density is now evident and strong, with roughly equal skew, and the box plot also illustrates the elimination of outliers, with both graphs in the same temperature range. It is also crucial to check for missing values before moving on to seasonality.
trimed_df_d.isnull().sum()  # confirm no missing values remain in the outlier subset
Temp 0 dtype: int64
# Down-sample to 10-minute means (halves the row count) and drop intervals
# that had no readings at all.
df_d_copy = df_d.resample('10min').mean()
df_d_copy = df_d_copy.dropna()
df_d_copy
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:10:00 | 15.160 |
| 2018-11-18 16:20:00 | 14.955 |
| 2018-11-18 16:30:00 | 15.390 |
| 2018-11-18 16:40:00 | 18.755 |
| 2018-11-18 16:50:00 | 21.965 |
| ... | ... |
| 2021-11-18 15:30:00 | 21.425 |
| 2021-11-18 15:40:00 | 21.520 |
| 2021-11-18 15:50:00 | 21.500 |
| 2021-11-18 16:00:00 | 21.600 |
| 2021-11-18 16:10:00 | 21.660 |
156764 rows × 1 columns
# Cluster the 10-minute mean temperatures with DBSCAN (1-D input: just the
# Temp column). Points that belong to no cluster get the label -1 and are
# treated as outliers. Only eps/min_samples are set; the original spelled
# out every remaining scikit-learn default explicitly.
outliers = DBSCAN(eps=0.5, min_samples=5)
clusters = outliers.fit_predict(df_d_copy)
df_d_copy['Cluster'] = clusters
df_d_copy
| Temp | Cluster | |
|---|---|---|
| EventDt | ||
| 2018-11-18 16:10:00 | 15.160 | 0 |
| 2018-11-18 16:20:00 | 14.955 | 0 |
| 2018-11-18 16:30:00 | 15.390 | 0 |
| 2018-11-18 16:40:00 | 18.755 | 0 |
| 2018-11-18 16:50:00 | 21.965 | 0 |
| ... | ... | ... |
| 2021-11-18 15:30:00 | 21.425 | 0 |
| 2021-11-18 15:40:00 | 21.520 | 0 |
| 2021-11-18 15:50:00 | 21.500 | 0 |
| 2021-11-18 16:00:00 | 21.600 | 0 |
| 2021-11-18 16:10:00 | 21.660 | 0 |
156764 rows × 2 columns
# Human-readable labels for plotting: DBSCAN marks noise as -1, every other
# id is a real cluster. The original map({0: ..., -1: ...}) left any cluster
# id >= 1 as NaN, silently dropping those rows from the hue legend; np.where
# labels all non-noise points as normal data.
df_d_copy['ClusterName'] = np.where(df_d_copy['Cluster'] == -1, "Outlier", "Normal data")
df_d_copy
| Temp | Cluster | ClusterName | |
|---|---|---|---|
| EventDt | |||
| 2018-11-18 16:10:00 | 15.160 | 0 | Normal data |
| 2018-11-18 16:20:00 | 14.955 | 0 | Normal data |
| 2018-11-18 16:30:00 | 15.390 | 0 | Normal data |
| 2018-11-18 16:40:00 | 18.755 | 0 | Normal data |
| 2018-11-18 16:50:00 | 21.965 | 0 | Normal data |
| ... | ... | ... | ... |
| 2021-11-18 15:30:00 | 21.425 | 0 | Normal data |
| 2021-11-18 15:40:00 | 21.520 | 0 | Normal data |
| 2021-11-18 15:50:00 | 21.500 | 0 | Normal data |
| 2021-11-18 16:00:00 | 21.600 | 0 | Normal data |
| 2021-11-18 16:10:00 | 21.660 | 0 | Normal data |
156764 rows × 3 columns
plt.figure(figsize=(16,8))
# colour the series by DBSCAN label (normal vs outlier)
sns.lineplot(data= df_d_copy, x= df_d_copy.index, y='Temp', hue = 'ClusterName')
<AxesSubplot:xlabel='EventDt', ylabel='Temp'>
df_d_rmv = df_d.dropna()  # z-score needs a gap-free series
# Z score: absolute standard score of every reading
z = np.abs(stats.zscore(df_d_rmv['Temp']))
print(z)
EventDt
2018-11-18 16:15:00 0.854694
2018-11-18 16:20:00 0.889414
2018-11-18 16:25:00 0.929475
2018-11-18 16:30:00 0.945499
2018-11-18 16:35:00 0.641035
...
2021-11-18 15:50:00 0.846569
2021-11-18 15:55:00 0.830544
2021-11-18 16:00:00 0.870606
2021-11-18 16:05:00 0.859923
2021-11-18 16:10:00 0.881289
Name: Temp, Length: 313427, dtype: float64
threshold =3  # conventional |z| > 3 cut-off
indexes = np.where(z > threshold)  # integer positions, consumed by the drop below
# Position of the outlier
print(indexes)
(array([ 13109, 13110, 13111, 13112, 13113, 21955, 21956, 21957,
21958, 21959, 21960, 21961, 21962, 21963, 21964, 21965,
21966, 21967, 21968, 21969, 21970, 21971, 21972, 21973,
21974, 21975, 21976, 21977, 21978, 21979, 21980, 21981,
21982, 21983, 21984, 21985, 21986, 21987, 21988, 21989,
21990, 21991, 21992, 21993, 21994, 21995, 21996, 21997,
21998, 21999, 22000, 22001, 22002, 22003, 22004, 22005,
22006, 22007, 22008, 22009, 22010, 22011, 22012, 22013,
22014, 22015, 22016, 22017, 22018, 22019, 22020, 22021,
22022, 22023, 22024, 22025, 22026, 22027, 22028, 22029,
22030, 22031, 22032, 22033, 22034, 22035, 22036, 22037,
22038, 22039, 22040, 22041, 22042, 22043, 22044, 22045,
22046, 22047, 22048, 22049, 22050, 22051, 22052, 22053,
22054, 22055, 22056, 22057, 22058, 22059, 22060, 22061,
22062, 22063, 22064, 22065, 22066, 22067, 22068, 22069,
22070, 22071, 22072, 22073, 22074, 22075, 22076, 22077,
22078, 22079, 22080, 22081, 22082, 22083, 22084, 22085,
22086, 22087, 22088, 22089, 22090, 22091, 22092, 22093,
71074, 71075, 71076, 71077, 71078, 71079, 71080, 71081,
71082, 71083, 71084, 71085, 71086, 71087, 71088, 71089,
71090, 71091, 71092, 71093, 71094, 71095, 71096, 71097,
71098, 71099, 71100, 71101, 71102, 71103, 71104, 71105,
71106, 71107, 71108, 71109, 71110, 71111, 71112, 71113,
71114, 71115, 71116, 71117, 71118, 71119, 71120, 71121,
71122, 71123, 71124, 71125, 71126, 71127, 71128, 71129,
71130, 71131, 71132, 71133, 71134, 71135, 71136, 71137,
71138, 71139, 71140, 71141, 71142, 71143, 71144, 71145,
71146, 71147, 71148, 71149, 71150, 71151, 71152, 71153,
71154, 71155, 71632, 71633, 71634, 71635, 71636, 71637,
71638, 71639, 71640, 71641, 71642, 71643, 71644, 71645,
71646, 71647, 71648, 71649, 71650, 71651, 71652, 71653,
71654, 71655, 71656, 71657, 71658, 71659, 71660, 71661,
71662, 71663, 71664, 71665, 71666, 71667, 71668, 71669,
71670, 71671, 71672, 71673, 71674, 71675, 71676, 71677,
71678, 71679, 71680, 71681, 71682, 71683, 71684, 71685,
71686, 71687, 71688, 71689, 71690, 71691, 71692, 71693,
71694, 71695, 71696, 71697, 71698, 71699, 71700, 71701,
71702, 71703, 71704, 71705, 71706, 71707, 71708, 71709,
71710, 71711, 71712, 71713, 71714, 71715, 71716, 71717,
71718, 71719, 71720, 71721, 71722, 71723, 71724, 71725,
71726, 71727, 71728, 71729, 71730, 71731, 71732, 71733,
71734, 71735, 71736, 71737, 71738, 71739, 71740, 71741,
71742, 71743, 71744, 71745, 80618, 96358, 99694, 104238,
104239, 168412, 168413, 168414, 168415, 168416, 168417, 168418,
168419, 168420, 168421, 168422, 168423, 168424, 168425, 168426,
168427, 168428, 168429, 178765, 178766, 178767, 178768, 178769,
178770, 178771, 178772, 178773, 178774, 178775, 225511, 225512,
225513, 225514, 225515, 225517, 225518, 225519, 225523, 225524,
225526, 225528, 225529, 225530, 225531, 225532, 225533, 225534,
278268, 278269, 278270, 278271, 278272, 278273, 278279, 278280,
278281, 278282, 278283, 278284, 278285, 278286, 278287, 278611,
278612, 278613, 278846, 278847, 278848, 278849, 278850, 278851,
278852, 278853, 278854, 278855, 278856, 278857, 278858, 278859,
278860, 278861, 278862, 278863, 278864, 278865, 278866, 278867,
278868, 278869, 278870, 278871, 278872, 278873, 278874, 278875,
278876, 278877, 278878, 278879, 278880, 278881, 278882, 278883,
278884, 278885, 278886, 278887, 278888, 278889, 278890, 278891,
278892, 278893, 278894, 278895, 278896, 278897, 278898, 278899,
278900, 278901, 278902, 278903, 278904, 278905, 278906, 278907,
278908, 278909, 278910, 278911, 278912, 278913, 278914, 278915,
278916, 278917, 278918, 279118, 279119, 279120, 279121, 279122,
279123, 279124, 279125, 279126, 279127, 279128, 279129, 279130,
279131, 279132, 279133, 279134, 279135, 279136, 279137, 279138,
279139, 279140, 279141, 279142, 279143, 279144, 279145, 279146,
279147, 279148, 279149, 279150, 279151, 279152, 279153, 279154,
279155, 279156, 279157, 279158, 279159, 279160, 279161, 279162,
279163, 279164, 279165, 279166, 279167, 279168, 279169, 279170,
279171, 279172, 279173, 279174, 279175, 279176, 279177, 279178,
279179, 279180, 279181, 279182, 279183, 279184, 279185, 279186,
279187, 279188, 279189, 279190, 279191, 279192, 279193, 279194,
279195, 279196, 279197, 279198, 279199, 279200, 279201, 279202,
279203, 279204, 279205, 279206, 279207, 279208, 279209, 279210,
279211, 279212, 279213, 279214, 279433, 279434, 279435, 279436,
279437, 279438, 279439, 279440, 279441, 279442, 279443, 279444,
279445, 279446, 279447, 279448, 279449, 279450, 279451, 279452,
279453, 279454, 279455, 279456, 279457, 279458, 279459, 279460,
279461, 279462, 279463, 279464, 279465, 279466, 279467, 279468,
279469, 279470, 279471, 279472, 279473, 279474, 279475, 279476,
279477, 279478, 279479, 279480, 279481, 279482, 279483, 279484],
dtype=int64),)
# Drop the z-score outlier rows. Reassign instead of inplace=True: the
# in-place drop on df_d_rmv triggered pandas' SettingWithCopyWarning
# (visible in the original cell output) and is the documented anti-pattern.
df_d_rmv = df_d_rmv.drop(df_d_rmv.index[indexes])
df_d_rmv
WARNING - (py.warnings._showwarnmsg) - C:\Users\shane\anaconda3\lib\site-packages\pandas\core\frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:15:00 | 15.16 |
| 2018-11-18 16:20:00 | 15.03 |
| 2018-11-18 16:25:00 | 14.88 |
| 2018-11-18 16:30:00 | 14.82 |
| 2018-11-18 16:35:00 | 15.96 |
| ... | ... |
| 2021-11-18 15:50:00 | 21.53 |
| 2021-11-18 15:55:00 | 21.47 |
| 2021-11-18 16:00:00 | 21.62 |
| 2021-11-18 16:05:00 | 21.58 |
| 2021-11-18 16:10:00 | 21.66 |
312795 rows × 1 columns
# Plot device D with the z-score outliers removed. The original called
# plt.legend() with no labelled artists, producing the "No handles with
# labels found" warning — give the line a label so the legend has an entry.
fig, axes = plt.subplots(1,1, figsize=(16,5))
plt.plot(df_d_rmv, color='#ca5c5d', label='Temp')
plt.legend(prop={'size': 20})
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.show()
No handles with labels found to put in legend.
# NOTE(review): the NeuralProphet import is commented out at the top of the
# file — confirm it is imported elsewhere before this cell runs.
m = NeuralProphet()
# help(m)
# Take a real copy: ChangeDatasetForNeural mutates its argument in place, and
# the original plain assignment aliased df_i, so df_i itself was silently
# renamed/extended too. Matches the .copy() convention used for df_k later.
df_i_copy = df_i.copy()
df_i_copy
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:35:00 | 14.60 |
| 2018-11-18 16:40:00 | 14.63 |
| 2018-11-18 16:45:00 | 14.60 |
| 2018-11-18 16:50:00 | 14.57 |
| 2018-11-18 16:55:00 | 14.59 |
| ... | ... |
| 2021-11-18 16:10:00 | 14.65 |
| 2021-11-18 16:15:00 | 14.66 |
| 2021-11-18 16:20:00 | 14.64 |
| 2021-11-18 16:25:00 | 14.65 |
| 2021-11-18 16:30:00 | 14.64 |
315648 rows × 1 columns
def ChangeDatasetForNeural(data):
    """Reshape a Temp series for NeuralProphet, in place.

    Adds a 'ds' column copied from the DatetimeIndex and renames the 'Temp'
    column to 'y' (the column names NeuralProphet expects). Mutates *data*
    and returns None, like the original.

    The original assigned ``data = data.rename(..., inplace=True)`` — which
    binds None to the local — and ended with a bare ``data`` expression;
    both were dead code and are removed here. The observable behaviour
    (in-place mutation) is unchanged.
    """
    data["ds"] = data.index
    data.rename(columns={'Temp': 'y'}, inplace=True)
ChangeDatasetForNeural(df_i_copy)  # adds 'ds' and renames Temp -> y, in place
df_i_copy.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 315648 entries, 2018-11-18 16:35:00 to 2021-11-18 16:30:00 Freq: 5T Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 y 278713 non-null float64 1 ds 315648 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1) memory usage: 15.3 MB
warnings.filterwarnings('ignore')
# Yearly seasonality on, 30 epochs; weekly/daily left to auto-detection.
m = NeuralProphet(yearly_seasonality=True,
# weekly_seasonality=False,
# daily_seasonality=False,
epochs=30
)
# NOTE(review): freq="D" but the data is 5-minute — the fit log warns about
# the mismatch; confirm "5min" wasn't intended.
metrics = m.fit(df_i_copy, freq="D")
metrics
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 100.0% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T INFO - (NP.forecaster.__handle_missing_data) - dropped 36935 NAN row in 'y' INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 128
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 9.63E-02, min: 1.42E+00
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.08E-01, min: 1.58E+00 INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 1.01E-01 Epoch[30/30]: 100%|█████████████| 30/30 [02:42<00:00, 5.43s/it, SmoothL1Loss=0.00125, MAE=0.882, RMSE=1.14, RegLoss=0]
| SmoothL1Loss | MAE | RMSE | RegLoss | |
|---|---|---|---|---|
| 0 | 0.144454 | 7.120226 | 8.807730 | 0.0 |
| 1 | 0.001392 | 0.933123 | 1.203727 | 0.0 |
| 2 | 0.001614 | 1.009015 | 1.297013 | 0.0 |
| 3 | 0.002078 | 1.153540 | 1.472602 | 0.0 |
| 4 | 0.002634 | 1.302832 | 1.657626 | 0.0 |
| 5 | 0.003183 | 1.439530 | 1.819869 | 0.0 |
| 6 | 0.003659 | 1.542789 | 1.949148 | 0.0 |
| 7 | 0.004242 | 1.661180 | 2.092652 | 0.0 |
| 8 | 0.004222 | 1.658254 | 2.088269 | 0.0 |
| 9 | 0.004362 | 1.686928 | 2.125060 | 0.0 |
| 10 | 0.004369 | 1.687505 | 2.125258 | 0.0 |
| 11 | 0.004083 | 1.636513 | 2.060614 | 0.0 |
| 12 | 0.003918 | 1.599775 | 2.017023 | 0.0 |
| 13 | 0.003803 | 1.568281 | 1.979653 | 0.0 |
| 14 | 0.003598 | 1.529289 | 1.930130 | 0.0 |
| 15 | 0.003312 | 1.468995 | 1.856270 | 0.0 |
| 16 | 0.003121 | 1.423439 | 1.802384 | 0.0 |
| 17 | 0.002783 | 1.345538 | 1.706139 | 0.0 |
| 18 | 0.002593 | 1.294136 | 1.643721 | 0.0 |
| 19 | 0.002446 | 1.256024 | 1.598694 | 0.0 |
| 20 | 0.002202 | 1.190325 | 1.517104 | 0.0 |
| 21 | 0.001987 | 1.127910 | 1.441311 | 0.0 |
| 22 | 0.001860 | 1.087804 | 1.393282 | 0.0 |
| 23 | 0.001728 | 1.047388 | 1.343303 | 0.0 |
| 24 | 0.001610 | 1.008683 | 1.296144 | 0.0 |
| 25 | 0.001493 | 0.969262 | 1.247657 | 0.0 |
| 26 | 0.001394 | 0.934537 | 1.204261 | 0.0 |
| 27 | 0.001327 | 0.909690 | 1.174538 | 0.0 |
| 28 | 0.001276 | 0.891342 | 1.151115 | 0.0 |
| 29 | 0.001249 | 0.881930 | 1.138822 | 0.0 |
future = m.make_future_dataframe(df_i_copy, periods=365, n_historic_predictions=True)  # 365 steps ahead plus all history for plotting
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 100.0% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T
forecast = m.predict(future)
fig1 = m.plot(forecast)  # NeuralProphet's built-in forecast plot
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.884% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.884% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T
forecast  # inspect yhat1, residual and the trend/seasonality columns
| ds | y | yhat1 | residual1 | trend | season_yearly | season_weekly | season_daily | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2018-11-18 16:35:00 | 14.60 | 15.339128 | 0.739128 | 17.603092 | -2.484297 | -0.535900 | 0.756234 |
| 1 | 2018-11-18 16:40:00 | 14.63 | 15.338090 | 0.708090 | 17.603178 | -2.484470 | -0.535435 | 0.754818 |
| 2 | 2018-11-18 16:45:00 | 14.60 | 15.336798 | 0.736798 | 17.603262 | -2.484643 | -0.534960 | 0.753139 |
| 3 | 2018-11-18 16:50:00 | 14.57 | 15.335238 | 0.765238 | 17.603346 | -2.484817 | -0.534476 | 0.751185 |
| 4 | 2018-11-18 16:55:00 | 14.59 | 15.333398 | 0.743398 | 17.603430 | -2.484990 | -0.533983 | 0.748941 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 316008 | 2022-11-14 16:30:00 | NaN | 17.999220 | NaN | 19.717846 | -2.279892 | -0.196133 | 0.757400 |
| 316009 | 2022-11-15 16:30:00 | NaN | 18.294107 | NaN | 19.726618 | -2.332093 | 0.142182 | 0.757400 |
| 316010 | 2022-11-16 16:30:00 | NaN | 18.371405 | NaN | 19.735392 | -2.383390 | 0.262002 | 0.757400 |
| 316011 | 2022-11-17 16:30:00 | NaN | 18.335726 | NaN | 19.744164 | -2.433996 | 0.268157 | 0.757400 |
| 316012 | 2022-11-18 16:30:00 | NaN | 18.324465 | NaN | 19.752939 | -2.484123 | 0.298249 | 0.757400 |
316013 rows × 8 columns
# overlay the actual series against the NeuralProphet predictions
fig, ax = plt.subplots()
ax.plot(forecast["ds"],forecast["y"], label="Data")
ax.plot(forecast["ds"],forecast["yhat1"],label="Predicted Data")
ax.legend(loc = 'upper right')
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()
fig2 = m.plot_components(forecast)  # trend / seasonality component plots
## https://neuralprophet.com/html/test_and_crossvalidate.html
## Need to add crossvalidation
df_k_daily  # display the daily-resampled device K series
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 | -26.699659 |
| 2018-11-19 | -26.833437 |
| 2018-11-20 | -26.843924 |
| 2018-11-21 | -26.860833 |
| 2018-11-22 | -26.911181 |
| ... | ... |
| 2021-11-14 | -26.610556 |
| 2021-11-15 | -26.636389 |
| 2021-11-16 | -26.782951 |
| 2021-11-17 | -26.773368 |
| 2021-11-18 | -26.819950 |
1097 rows × 1 columns
plt.rc('figure',figsize=(14,8))
plt.rc('font',size=15)
# additive decomposition of the daily device-K series (period inferred from the index)
result = seasonal_decompose(df_k_daily,model='additive')
fig = result.plot()
# SARIMA(1,1,1)x(1,1,0,12) on DAILY data, so the seasonal period is 12 days —
# NOTE(review): confirm 12 wasn't meant as months.
model = sm.tsa.statespace.SARIMAX(df_k_daily['Temp'],
order=(1,1,1),
seasonal_order=(1,1,0,12))
sarima = model.fit()
predictions = sarima.predict()  # in-sample one-step-ahead predictions
plt.figure(figsize=(16,4))
plt.plot(df_k_daily, label="Actual")
plt.plot(predictions, label="Predicted")
plt.title('Comparison of predicted and actual data', fontsize=20)
plt.ylabel('Temperature in celsius', fontsize=16)
plt.legend()
<matplotlib.legend.Legend at 0x1facd02e880>
forecast = sarima.forecast(100)  # 100 steps (days) beyond the sample
ax = df_k_daily.plot(label='observed', figsize=(20, 15))
forecast.plot(ax=ax, label='Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature in celsius')
plt.legend()
plt.show()
# Seasonal - fit stepwise auto-ARIMA
# NOTE(review): the pmdarima import is commented out at the top of the file —
# confirm pm is imported elsewhere before this cell runs.
smodel = pm.auto_arima(df_k_daily, start_p=1, start_q=1,
test='adf',
max_p=1, max_q=1, m=12,
start_P=0, seasonal=True,
d=None, D=1, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
Performing stepwise search to minimize aic ARIMA(1,0,1)(0,1,1)[12] intercept : AIC=inf, Time=2.83 sec ARIMA(0,0,0)(0,1,0)[12] intercept : AIC=1624.175, Time=0.17 sec ARIMA(1,0,0)(1,1,0)[12] intercept : AIC=812.863, Time=1.00 sec ARIMA(0,0,1)(0,1,1)[12] intercept : AIC=986.762, Time=1.10 sec ARIMA(0,0,0)(0,1,0)[12] : AIC=1622.177, Time=0.06 sec ARIMA(1,0,0)(0,1,0)[12] intercept : AIC=1078.422, Time=0.26 sec ARIMA(1,0,0)(2,1,0)[12] intercept : AIC=701.203, Time=2.45 sec ARIMA(1,0,0)(2,1,1)[12] intercept : AIC=inf, Time=9.76 sec ARIMA(1,0,0)(1,1,1)[12] intercept : AIC=inf, Time=2.55 sec ARIMA(0,0,0)(2,1,0)[12] intercept : AIC=1438.837, Time=1.52 sec ARIMA(1,0,1)(2,1,0)[12] intercept : AIC=647.388, Time=4.06 sec ARIMA(1,0,1)(1,1,0)[12] intercept : AIC=772.897, Time=1.42 sec ARIMA(1,0,1)(2,1,1)[12] intercept : AIC=inf, Time=10.20 sec ARIMA(1,0,1)(1,1,1)[12] intercept : AIC=inf, Time=4.54 sec ARIMA(0,0,1)(2,1,0)[12] intercept : AIC=1031.394, Time=2.07 sec ARIMA(1,0,1)(2,1,0)[12] : AIC=645.388, Time=1.28 sec ARIMA(1,0,1)(1,1,0)[12] : AIC=770.897, Time=0.56 sec ARIMA(1,0,1)(2,1,1)[12] : AIC=inf, Time=10.18 sec ARIMA(1,0,1)(1,1,1)[12] : AIC=inf, Time=3.76 sec ARIMA(0,0,1)(2,1,0)[12] : AIC=1029.406, Time=0.76 sec ARIMA(1,0,0)(2,1,0)[12] : AIC=699.205, Time=0.80 sec ARIMA(0,0,0)(2,1,0)[12] : AIC=1436.853, Time=0.50 sec Best model: ARIMA(1,0,1)(2,1,0)[12] Total fit time: 61.834 seconds
data = df_k_daily
# Forecast the next 12 steps with the stepwise-selected model
n_periods = 12
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
# NOTE(review): freq='MS' builds a month-start index for a DAILY model, and
# it starts AT the last observed date (overlapping the sample) — confirm
# freq='D' starting one day later wasn't intended.
index_of_fc = pd.date_range(data.index[-1], periods = n_periods, freq='MS')
# make series for plotting purpose
fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
# Plot the forecast with its shaded confidence band
plt.plot(data)
plt.plot(fitted_series, color='darkgreen')
plt.fill_between(lower_series.index,
lower_series,
upper_series,
color='k', alpha=.15)
plt.title("SARIMA - Forecast of temperature change")
plt.show()
df_k_hourly = df_k.resample('H').mean()  # hourly means of the 5-minute data
df_k_hourly
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:00:00 | -26.327500 |
| 2018-11-18 17:00:00 | -26.729167 |
| 2018-11-18 18:00:00 | -26.876667 |
| 2018-11-18 19:00:00 | -26.638333 |
| 2018-11-18 20:00:00 | -26.570833 |
| ... | ... |
| 2021-11-18 12:00:00 | -26.644167 |
| 2021-11-18 13:00:00 | -27.124167 |
| 2021-11-18 14:00:00 | -26.960833 |
| 2021-11-18 15:00:00 | -26.832500 |
| 2021-11-18 16:00:00 | -26.922500 |
26305 rows × 1 columns
# Seasonal - fit stepwise auto-ARIMA on the hourly means (m=12 means a
# 12-HOUR season here — NOTE(review): confirm 24 wasn't intended).
smodel = pm.auto_arima(df_k_hourly, start_p=1, start_q=1,
test='adf',
max_p=1, max_q=1, m=12,
start_P=0, seasonal=True,
d=None, D=1, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
Performing stepwise search to minimize aic ARIMA(1,0,1)(0,1,1)[12] intercept : AIC=inf, Time=61.54 sec ARIMA(0,0,0)(0,1,0)[12] intercept : AIC=57074.132, Time=3.53 sec ARIMA(1,0,0)(1,1,0)[12] intercept : AIC=40597.371, Time=26.83 sec ARIMA(0,0,1)(0,1,1)[12] intercept : AIC=inf, Time=37.68 sec ARIMA(0,0,0)(0,1,0)[12] : AIC=57072.133, Time=1.13 sec ARIMA(1,0,0)(0,1,0)[12] intercept : AIC=48515.830, Time=2.79 sec ARIMA(1,0,0)(2,1,0)[12] intercept : AIC=37728.162, Time=65.57 sec ARIMA(1,0,0)(2,1,1)[12] intercept : AIC=inf, Time=199.46 sec ARIMA(1,0,0)(1,1,1)[12] intercept : AIC=inf, Time=48.83 sec ARIMA(0,0,0)(2,1,0)[12] intercept : AIC=47275.548, Time=30.04 sec ARIMA(1,0,1)(2,1,0)[12] intercept : AIC=36942.459, Time=77.41 sec ARIMA(1,0,1)(1,1,0)[12] intercept : AIC=39674.235, Time=22.93 sec ARIMA(1,0,1)(2,1,1)[12] intercept : AIC=inf, Time=252.64 sec ARIMA(1,0,1)(1,1,1)[12] intercept : AIC=inf, Time=62.64 sec ARIMA(0,0,1)(2,1,0)[12] intercept : AIC=37419.564, Time=67.47 sec ARIMA(1,0,1)(2,1,0)[12] : AIC=36940.459, Time=20.95 sec ARIMA(1,0,1)(1,1,0)[12] : AIC=39672.235, Time=8.69 sec ARIMA(1,0,1)(2,1,1)[12] : AIC=inf, Time=41.12 sec ARIMA(1,0,1)(1,1,1)[12] : AIC=inf, Time=18.17 sec ARIMA(0,0,1)(2,1,0)[12] : AIC=37417.564, Time=20.09 sec ARIMA(1,0,0)(2,1,0)[12] : AIC=37726.162, Time=22.88 sec ARIMA(0,0,0)(2,1,0)[12] : AIC=47273.548, Time=24.97 sec Best model: ARIMA(1,0,1)(2,1,0)[12] Total fit time: 1117.949 seconds
# Forecast the next 12 steps from the hourly model
n_periods = 12
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
# NOTE(review): month-start index for an HOURLY model, anchored at the last
# observed timestamp — confirm freq='H' wasn't intended.
index_of_fc = pd.date_range(df_k_hourly.index[-1], periods = n_periods, freq='MS')
# make series for plotting purpose
fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
# Plot the forecast with its shaded confidence band
plt.plot(df_k_hourly)
plt.plot(fitted_series, color='darkgreen')
plt.fill_between(lower_series.index,
lower_series,
upper_series,
color='k', alpha=.15)
plt.title("SARIMA - Forecast of temperature change")
plt.show()
# Use a real copy so ChangeDatasetForNeural's in-place mutation does not also
# rename/extend df_k_daily itself — the original plain assignment was an
# alias, not a copy. Matches the .copy() convention used for df_k below.
df_k_daily_copy = df_k_daily.copy()
ChangeDatasetForNeural(df_k_daily_copy)
# the helper already renames Temp -> y in place, so this rename is a no-op
# safeguard; kept for parity with the original cell
df_k_daily_copy = df_k_daily_copy.rename(columns={"Temp": "y"})
df_k_daily_copy
| y | ds | |
|---|---|---|
| EventDt | ||
| 2018-11-18 | -26.699659 | 2018-11-18 |
| 2018-11-19 | -26.833437 | 2018-11-19 |
| 2018-11-20 | -26.843924 | 2018-11-20 |
| 2018-11-21 | -26.860833 | 2018-11-21 |
| 2018-11-22 | -26.911181 | 2018-11-22 |
| ... | ... | ... |
| 2021-11-14 | -26.610556 | 2021-11-14 |
| 2021-11-15 | -26.636389 | 2021-11-15 |
| 2021-11-16 | -26.782951 | 2021-11-16 |
| 2021-11-17 | -26.773368 | 2021-11-17 |
| 2021-11-18 | -26.819950 | 2021-11-18 |
1097 rows × 2 columns
warnings.filterwarnings('ignore')
# yearly seasonality on, 30 epochs; other seasonalities auto-detected
model = NeuralProphet(yearly_seasonality=True,
epochs=30
)
# daily data this time, so freq="D" matches the series (fit log confirms)
result = model.fit(df_k_daily_copy, freq="D")
result
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.909% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.utils.set_auto_seasonalities) - Disabling daily seasonality. Run NeuralProphet with daily_seasonality=True to override this. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.84E-01, min: 3.56E-01
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.56E-01, min: 3.56E-01 INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 1.42E-01 Epoch[30/30]: 100%|████████████| 30/30 [00:02<00:00, 13.16it/s, SmoothL1Loss=0.00469, MAE=0.273, RMSE=0.375, RegLoss=0]
| SmoothL1Loss | MAE | RMSE | RegLoss | |
|---|---|---|---|---|
| 0 | 2.175944 | 11.166567 | 12.335355 | 0.0 |
| 1 | 1.777679 | 9.461046 | 10.568139 | 0.0 |
| 2 | 0.718727 | 4.698974 | 5.540011 | 0.0 |
| 3 | 0.074460 | 1.308047 | 1.597903 | 0.0 |
| 4 | 0.029180 | 0.817408 | 0.992852 | 0.0 |
| 5 | 0.011256 | 0.460837 | 0.609251 | 0.0 |
| 6 | 0.008134 | 0.393616 | 0.514549 | 0.0 |
| 7 | 0.008096 | 0.384295 | 0.517376 | 0.0 |
| 8 | 0.007787 | 0.383980 | 0.507926 | 0.0 |
| 9 | 0.008265 | 0.388095 | 0.522196 | 0.0 |
| 10 | 0.007884 | 0.381148 | 0.511352 | 0.0 |
| 11 | 0.007752 | 0.372329 | 0.498515 | 0.0 |
| 12 | 0.010121 | 0.414069 | 0.567778 | 0.0 |
| 13 | 0.007537 | 0.369763 | 0.497342 | 0.0 |
| 14 | 0.007628 | 0.376905 | 0.501520 | 0.0 |
| 15 | 0.007771 | 0.378796 | 0.505825 | 0.0 |
| 16 | 0.007879 | 0.373420 | 0.506980 | 0.0 |
| 17 | 0.006509 | 0.338613 | 0.455128 | 0.0 |
| 18 | 0.006936 | 0.354342 | 0.473835 | 0.0 |
| 19 | 0.005993 | 0.321226 | 0.435563 | 0.0 |
| 20 | 0.005899 | 0.318825 | 0.434054 | 0.0 |
| 21 | 0.005534 | 0.308367 | 0.417041 | 0.0 |
| 22 | 0.005475 | 0.304162 | 0.411853 | 0.0 |
| 23 | 0.005134 | 0.289447 | 0.397398 | 0.0 |
| 24 | 0.005133 | 0.288679 | 0.398227 | 0.0 |
| 25 | 0.005072 | 0.282666 | 0.391177 | 0.0 |
| 26 | 0.005154 | 0.294898 | 0.403838 | 0.0 |
| 27 | 0.004831 | 0.274834 | 0.386008 | 0.0 |
| 28 | 0.004720 | 0.273118 | 0.380040 | 0.0 |
| 29 | 0.004690 | 0.273154 | 0.375458 | 0.0 |
future = model.make_future_dataframe(df_k_daily_copy, periods=365, n_historic_predictions=True)  # one year ahead plus all history
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.909% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
forecast = model.predict(future)
fig1 = model.plot(forecast)  # NeuralProphet's built-in forecast plot
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.932% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.932% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
# overlay actual vs predicted for the daily device-K model
fig, ax = plt.subplots()
ax.plot(forecast["ds"],forecast["y"], label="Data")
ax.plot(forecast["ds"],forecast["yhat1"],label="Predicted Data")
ax.legend(loc = 'upper right')
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()
df_k_copy = df_k.copy()  # real copy: the in-place helper below mutates it
df_k_copy
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:40:00 | -25.93 |
| 2018-11-18 16:45:00 | -26.14 |
| 2018-11-18 16:50:00 | -26.44 |
| 2018-11-18 16:55:00 | -26.80 |
| 2018-11-18 17:00:00 | -27.04 |
| ... | ... |
| 2021-11-18 16:15:00 | -26.95 |
| 2021-11-18 16:20:00 | -26.76 |
| 2021-11-18 16:25:00 | -26.54 |
| 2021-11-18 16:30:00 | -26.54 |
| 2021-11-18 16:35:00 | -26.77 |
315648 rows × 1 columns
ChangeDatasetForNeural(df_k_copy)  # adds 'ds' and renames Temp -> y, in place
# the helper already renamed Temp -> y, so this is a no-op safeguard
df_k_copy = df_k_copy.rename(columns={"Temp": "y"})
df_k_copy
| y | ds | |
|---|---|---|
| EventDt | ||
| 2018-11-18 16:40:00 | -25.93 | 2018-11-18 16:40:00 |
| 2018-11-18 16:45:00 | -26.14 | 2018-11-18 16:45:00 |
| 2018-11-18 16:50:00 | -26.44 | 2018-11-18 16:50:00 |
| 2018-11-18 16:55:00 | -26.80 | 2018-11-18 16:55:00 |
| 2018-11-18 17:00:00 | -27.04 | 2018-11-18 17:00:00 |
| ... | ... | ... |
| 2021-11-18 16:15:00 | -26.95 | 2021-11-18 16:15:00 |
| 2021-11-18 16:20:00 | -26.76 | 2021-11-18 16:20:00 |
| 2021-11-18 16:25:00 | -26.54 | 2021-11-18 16:25:00 |
| 2021-11-18 16:30:00 | -26.54 | 2021-11-18 16:30:00 |
| 2021-11-18 16:35:00 | -26.77 | 2021-11-18 16:35:00 |
315648 rows × 2 columns
warnings.filterwarnings('ignore')
model = NeuralProphet(yearly_seasonality=True,
epochs=30
)
# NOTE(review): freq="D" but df_k_copy is 5-minute data — the fit log warns
# about the mismatch; confirm "5min" wasn't intended.
result = model.fit(df_k_copy, freq="D")
result
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 100.0% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T INFO - (NP.forecaster.__handle_missing_data) - dropped 59 NAN row in 'y' INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 128
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 6.40E-02, min: 1.62E+00
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.73E-02, min: 2.53E+00 INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 7.16E-02 Epoch[30/30]: 100%|█████████████| 30/30 [03:05<00:00, 6.18s/it, SmoothL1Loss=0.0077, MAE=0.457, RMSE=0.627, RegLoss=0]
| SmoothL1Loss | MAE | RMSE | RegLoss | |
|---|---|---|---|---|
| 0 | 0.272628 | 2.594405 | 3.170851 | 0.0 |
| 1 | 0.009270 | 0.511330 | 0.692476 | 0.0 |
| 2 | 0.008690 | 0.490169 | 0.669138 | 0.0 |
| 3 | 0.009116 | 0.501481 | 0.685797 | 0.0 |
| 4 | 0.009822 | 0.522697 | 0.712154 | 0.0 |
| 5 | 0.010420 | 0.540624 | 0.734861 | 0.0 |
| 6 | 0.011117 | 0.560126 | 0.758920 | 0.0 |
| 7 | 0.011370 | 0.567653 | 0.768319 | 0.0 |
| 8 | 0.011728 | 0.577634 | 0.779639 | 0.0 |
| 9 | 0.011651 | 0.575437 | 0.777522 | 0.0 |
| 10 | 0.011610 | 0.573951 | 0.775849 | 0.0 |
| 11 | 0.011482 | 0.570793 | 0.771233 | 0.0 |
| 12 | 0.011320 | 0.566193 | 0.765746 | 0.0 |
| 13 | 0.011108 | 0.560301 | 0.759164 | 0.0 |
| 14 | 0.010957 | 0.555529 | 0.753542 | 0.0 |
| 15 | 0.010620 | 0.545747 | 0.741413 | 0.0 |
| 16 | 0.010480 | 0.542662 | 0.736384 | 0.0 |
| 17 | 0.010131 | 0.531323 | 0.723959 | 0.0 |
| 18 | 0.009883 | 0.524671 | 0.713654 | 0.0 |
| 19 | 0.009536 | 0.514228 | 0.701995 | 0.0 |
| 20 | 0.009261 | 0.506143 | 0.691517 | 0.0 |
| 21 | 0.009030 | 0.499022 | 0.682790 | 0.0 |
| 22 | 0.008810 | 0.492240 | 0.673723 | 0.0 |
| 23 | 0.008513 | 0.483195 | 0.661659 | 0.0 |
| 24 | 0.008321 | 0.476702 | 0.654203 | 0.0 |
| 25 | 0.008142 | 0.471296 | 0.646399 | 0.0 |
| 26 | 0.007980 | 0.466310 | 0.640164 | 0.0 |
| 27 | 0.007841 | 0.461896 | 0.634272 | 0.0 |
| 28 | 0.007748 | 0.459070 | 0.629984 | 0.0 |
| 29 | 0.007700 | 0.457429 | 0.627387 | 0.0 |
future = model.make_future_dataframe(df_k_copy, periods=365, n_historic_predictions=True)  # 365 steps ahead plus all history
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 100.0% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T
forecast = model.predict(future)
fig1 = model.plot(forecast)  # NeuralProphet's built-in forecast plot
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.884% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.884% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T
# Overlay the observed series ("y") and the model prediction ("yhat1")
# from the forecast frame on a single, enlarged axis.
fig, ax = plt.subplots()
timestamps = forecast["ds"]
ax.plot(timestamps, forecast["y"], label="Data")
ax.plot(timestamps, forecast["yhat1"], label="Predicted Data")
ax.legend(loc='upper right')
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()
# Inspect the daily-resampled temperature series (output table follows).
df_s_daily
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 | 1.119767 |
| 2018-11-19 | 1.457917 |
| 2018-11-20 | 1.675347 |
| 2018-11-21 | 1.848229 |
| 2018-11-22 | 1.921597 |
| ... | ... |
| 2021-11-14 | 1.146146 |
| 2021-11-15 | 1.027743 |
| 2021-11-16 | 1.436840 |
| 2021-11-17 | 1.883576 |
| 2021-11-18 | 1.571294 |
1097 rows × 1 columns
# Larger default figure/font so the 4-panel decomposition stays readable.
plt.rc('figure',figsize=(14,8))
plt.rc('font',size=15)
# Split the daily series into trend + seasonal + residual components.
result = seasonal_decompose(df_s_daily,model='additive')
fig = result.plot()
# SARIMA(1,1,1)x(1,1,0,12) on the daily series.
# NOTE(review): the seasonal period 12 applied to DAILY data means a 12-day
# cycle, not an annual one — confirm this was intended.
model = sm.tsa.statespace.SARIMAX(df_s_daily,
order=(1,1,1),
seasonal_order=(1,1,0,12))
sarima = model.fit()
predictions = sarima.predict()  # in-sample one-step predictions
forecast = sarima.forecast(100)  # 100 steps past the last observation
# Plot observed data with the out-of-sample forecast appended.
ax = df_s_daily.plot(label='observed', figsize=(20, 15))
forecast.plot(ax=ax, label='Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature in celsius')
plt.legend()
plt.show()
# Seasonal - fit stepwise auto-ARIMA
# NOTE(review): `pm` (pmdarima) is commented out in the imports at the top
# of the file — re-enable `import pmdarima as pm` before running this cell.
smodel = pm.auto_arima(df_s_daily, start_p=1, start_q=1,
test='adf',
max_p=3, max_q=3, m=12,
start_P=0, seasonal=True,
d=None, D=1, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
Performing stepwise search to minimize aic ARIMA(1,0,1)(0,1,1)[12] intercept : AIC=inf, Time=3.63 sec ARIMA(0,0,0)(0,1,0)[12] intercept : AIC=2103.867, Time=0.27 sec ARIMA(1,0,0)(1,1,0)[12] intercept : AIC=1395.603, Time=0.90 sec ARIMA(0,0,1)(0,1,1)[12] intercept : AIC=inf, Time=2.44 sec ARIMA(0,0,0)(0,1,0)[12] : AIC=2101.869, Time=0.06 sec ARIMA(1,0,0)(0,1,0)[12] intercept : AIC=1642.911, Time=0.26 sec ARIMA(1,0,0)(2,1,0)[12] intercept : AIC=1261.711, Time=2.54 sec ARIMA(1,0,0)(2,1,1)[12] intercept : AIC=inf, Time=9.17 sec ARIMA(1,0,0)(1,1,1)[12] intercept : AIC=inf, Time=4.05 sec ARIMA(0,0,0)(2,1,0)[12] intercept : AIC=1807.304, Time=3.26 sec ARIMA(2,0,0)(2,1,0)[12] intercept : AIC=1263.513, Time=2.84 sec ARIMA(1,0,1)(2,1,0)[12] intercept : AIC=1263.498, Time=3.39 sec ARIMA(0,0,1)(2,1,0)[12] intercept : AIC=1425.932, Time=2.06 sec ARIMA(2,0,1)(2,1,0)[12] intercept : AIC=1262.776, Time=9.73 sec ARIMA(1,0,0)(2,1,0)[12] : AIC=1259.730, Time=0.87 sec ARIMA(1,0,0)(1,1,0)[12] : AIC=1393.606, Time=0.38 sec ARIMA(1,0,0)(2,1,1)[12] : AIC=inf, Time=5.95 sec ARIMA(1,0,0)(1,1,1)[12] : AIC=inf, Time=1.94 sec ARIMA(0,0,0)(2,1,0)[12] : AIC=1805.377, Time=0.38 sec ARIMA(2,0,0)(2,1,0)[12] : AIC=1261.532, Time=1.02 sec ARIMA(1,0,1)(2,1,0)[12] : AIC=1261.517, Time=1.10 sec ARIMA(0,0,1)(2,1,0)[12] : AIC=1423.979, Time=0.69 sec ARIMA(2,0,1)(2,1,0)[12] : AIC=1260.787, Time=2.86 sec Best model: ARIMA(1,0,0)(2,1,0)[12] Total fit time: 59.829 seconds
# Forecast 24 steps ahead with a 95% confidence interval.
n_periods = 24
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
# The model was fit on a DAILY series, so the forecast index must advance
# one day per step, starting the day AFTER the last observation.
# (The original used freq='MS' — month-start — which scattered the 24 daily
# steps over two years and overlapped step 1 with the last observed date.)
index_of_fc = pd.date_range(df_s_daily.index[-1] + pd.Timedelta(days=1),
                            periods=n_periods, freq='D')
# make series for plotting purpose
fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
# Plot observed data, point forecast, and shaded confidence band.
plt.plot(df_s_daily)
plt.plot(fitted_series, color='darkgreen')
plt.fill_between(lower_series.index,
                 lower_series,
                 upper_series,
                 color='k', alpha=.15)
plt.title("SARIMA - Forecast of temperature change")
plt.show()
# Work on an independent copy: plain assignment would only alias
# df_s_daily, so the in-place prep below would silently mutate the
# original daily frame (the df_d section already uses .copy()).
df_s_daily_copy = df_s_daily.copy()
ChangeDatasetForNeural(df_s_daily_copy)
# NeuralProphet expects the target column to be named "y".
df_s_daily_copy = df_s_daily_copy.rename(columns={"Temp": "y"})
df_s_daily_copy
| y | ds | |
|---|---|---|
| EventDt | ||
| 2018-11-18 | 1.119767 | 2018-11-18 |
| 2018-11-19 | 1.457917 | 2018-11-19 |
| 2018-11-20 | 1.675347 | 2018-11-20 |
| 2018-11-21 | 1.848229 | 2018-11-21 |
| 2018-11-22 | 1.921597 | 2018-11-22 |
| ... | ... | ... |
| 2021-11-14 | 1.146146 | 2021-11-14 |
| 2021-11-15 | 1.027743 | 2021-11-15 |
| 2021-11-16 | 1.436840 | 2021-11-16 |
| 2021-11-17 | 1.883576 | 2021-11-17 |
| 2021-11-18 | 1.571294 | 2021-11-18 |
1097 rows × 2 columns
# Silence library warnings during training.
warnings.filterwarnings('ignore')
# NOTE(review): the NeuralProphet import is commented out at the top of the
# file — re-enable it before running this cell.
model = NeuralProphet(yearly_seasonality=True,
epochs=100
)
# freq="D" matches the daily cadence of df_s_daily_copy.
result = model.fit(df_s_daily_copy, freq="D")
result
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.909% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.utils.set_auto_seasonalities) - Disabling daily seasonality. Run NeuralProphet with daily_seasonality=True to override this. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.75E-02, min: 2.17E-01
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.75E-02, min: 8.17E-01 INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 6.92E-02 Epoch[100/100]: 100%|█████████| 100/100 [00:07<00:00, 14.07it/s, SmoothL1Loss=0.0206, MAE=0.365, RMSE=0.445, RegLoss=0]
| SmoothL1Loss | MAE | RMSE | RegLoss | |
|---|---|---|---|---|
| 0 | 0.284581 | 1.404975 | 1.712114 | 0.0 |
| 1 | 0.249011 | 1.301416 | 1.590657 | 0.0 |
| 2 | 0.201140 | 1.149792 | 1.417960 | 0.0 |
| 3 | 0.141510 | 0.942914 | 1.175256 | 0.0 |
| 4 | 0.083857 | 0.716013 | 0.892040 | 0.0 |
| ... | ... | ... | ... | ... |
| 95 | 0.020742 | 0.365679 | 0.446185 | 0.0 |
| 96 | 0.020677 | 0.365334 | 0.446118 | 0.0 |
| 97 | 0.020667 | 0.365500 | 0.445414 | 0.0 |
| 98 | 0.020651 | 0.365309 | 0.445275 | 0.0 |
| 99 | 0.020643 | 0.365193 | 0.445419 | 0.0 |
100 rows × 4 columns
# Extend the daily frame 365 periods ahead (history rows included).
future = model.make_future_dataframe(df_s_daily_copy, periods=365, n_historic_predictions=True)
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.909% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
# Predict over history + future and draw NeuralProphet's built-in plot.
forecast = model.predict(future)
fig1 = model.plot(forecast)
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.932% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.932% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
# Plot observed vs predicted values from the NeuralProphet forecast frame.
fig, ax = plt.subplots()
for column, label in (("y", "Data"), ("yhat1", "Predicted Data")):
    ax.plot(forecast["ds"], forecast[column], label=label)
ax.legend(loc='upper right')
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()
# Inspect the daily-resampled df_d temperature series (table follows).
df_d_daily
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 | 17.452258 |
| 2018-11-19 | 16.783854 |
| 2018-11-20 | 15.723333 |
| 2018-11-21 | 16.444618 |
| 2018-11-22 | 16.489757 |
| ... | ... |
| 2021-11-14 | 16.897604 |
| 2021-11-15 | 19.999410 |
| 2021-11-16 | 20.337917 |
| 2021-11-17 | 19.501250 |
| 2021-11-18 | 20.134000 |
1097 rows × 1 columns
# Check dtypes/null counts — output below shows 6 missing daily values.
df_d_daily.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1097 entries, 2018-11-18 to 2021-11-18 Freq: D Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 1091 non-null float64 dtypes: float64(1) memory usage: 17.1 KB
# Model on a NaN-free copy so the 6 missing daily means are excluded
# while df_d_daily itself stays untouched.
df_d_daily_copy = df_d_daily.copy().dropna()
df_d_daily_copy.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1091 entries, 2018-11-18 to 2021-11-18 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 1091 non-null float64 dtypes: float64(1) memory usage: 17.0 KB
# Seasonal - fit stepwise auto-ARIMA
# NOTE(review): `pm` (pmdarima) is commented out in the imports at the top
# of the file — re-enable `import pmdarima as pm` before running this cell.
smodel = pm.auto_arima(df_d_daily_copy, start_p=1, start_q=1,
test='adf',
max_p=3, max_q=3, m=12,
start_P=0, seasonal=True,
d=None, D=1, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
Performing stepwise search to minimize aic ARIMA(1,0,1)(0,1,1)[12] intercept : AIC=inf, Time=3.56 sec ARIMA(0,0,0)(0,1,0)[12] intercept : AIC=5590.480, Time=0.05 sec ARIMA(1,0,0)(1,1,0)[12] intercept : AIC=4917.078, Time=0.53 sec ARIMA(0,0,1)(0,1,1)[12] intercept : AIC=4595.859, Time=0.66 sec ARIMA(0,0,0)(0,1,0)[12] : AIC=5588.675, Time=0.05 sec ARIMA(0,0,1)(0,1,0)[12] intercept : AIC=5055.247, Time=0.23 sec ARIMA(0,0,1)(1,1,1)[12] intercept : AIC=4592.760, Time=0.96 sec ARIMA(0,0,1)(1,1,0)[12] intercept : AIC=4768.774, Time=0.54 sec ARIMA(0,0,1)(2,1,1)[12] intercept : AIC=4576.798, Time=2.29 sec ARIMA(0,0,1)(2,1,0)[12] intercept : AIC=4607.602, Time=1.86 sec ARIMA(0,0,1)(2,1,2)[12] intercept : AIC=4577.383, Time=3.62 sec ARIMA(0,0,1)(1,1,2)[12] intercept : AIC=4589.888, Time=4.17 sec ARIMA(0,0,0)(2,1,1)[12] intercept : AIC=5111.374, Time=1.78 sec ARIMA(1,0,1)(2,1,1)[12] intercept : AIC=inf, Time=10.77 sec ARIMA(0,0,2)(2,1,1)[12] intercept : AIC=4519.239, Time=2.82 sec ARIMA(0,0,2)(1,1,1)[12] intercept : AIC=4518.379, Time=1.16 sec ARIMA(0,0,2)(0,1,1)[12] intercept : AIC=4528.261, Time=0.91 sec ARIMA(0,0,2)(1,1,0)[12] intercept : AIC=4716.920, Time=0.65 sec ARIMA(0,0,2)(1,1,2)[12] intercept : AIC=4520.003, Time=3.94 sec ARIMA(0,0,2)(0,1,0)[12] intercept : AIC=5051.985, Time=0.35 sec ARIMA(0,0,2)(0,1,2)[12] intercept : AIC=4518.139, Time=2.55 sec ARIMA(0,0,1)(0,1,2)[12] intercept : AIC=4590.511, Time=1.55 sec ARIMA(1,0,2)(0,1,2)[12] intercept : AIC=inf, Time=11.77 sec ARIMA(0,0,3)(0,1,2)[12] intercept : AIC=4497.787, Time=2.64 sec ARIMA(0,0,3)(0,1,1)[12] intercept : AIC=4515.164, Time=1.07 sec ARIMA(0,0,3)(1,1,2)[12] intercept : AIC=4499.140, Time=3.96 sec ARIMA(0,0,3)(1,1,1)[12] intercept : AIC=4497.539, Time=1.53 sec ARIMA(0,0,3)(1,1,0)[12] intercept : AIC=4717.635, Time=1.08 sec ARIMA(0,0,3)(2,1,1)[12] intercept : AIC=4498.320, Time=3.94 sec ARIMA(0,0,3)(0,1,0)[12] intercept : AIC=5035.327, Time=0.47 sec ARIMA(0,0,3)(2,1,0)[12] intercept : AIC=4562.895, Time=3.41 
sec ARIMA(0,0,3)(2,1,2)[12] intercept : AIC=inf, Time=10.18 sec ARIMA(1,0,3)(1,1,1)[12] intercept : AIC=inf, Time=5.92 sec ARIMA(1,0,2)(1,1,1)[12] intercept : AIC=inf, Time=5.81 sec ARIMA(0,0,3)(1,1,1)[12] : AIC=4497.097, Time=0.86 sec ARIMA(0,0,3)(0,1,1)[12] : AIC=4516.029, Time=0.70 sec ARIMA(0,0,3)(1,1,0)[12] : AIC=4715.815, Time=0.46 sec ARIMA(0,0,3)(2,1,1)[12] : AIC=4497.474, Time=1.96 sec ARIMA(0,0,3)(1,1,2)[12] : AIC=4498.491, Time=2.41 sec ARIMA(0,0,3)(0,1,0)[12] : AIC=5033.448, Time=0.21 sec ARIMA(0,0,3)(0,1,2)[12] : AIC=4496.996, Time=1.24 sec ARIMA(0,0,2)(0,1,2)[12] : AIC=4517.390, Time=1.09 sec ARIMA(1,0,3)(0,1,2)[12] : AIC=inf, Time=6.41 sec ARIMA(1,0,2)(0,1,2)[12] : AIC=inf, Time=8.71 sec Best model: ARIMA(0,0,3)(0,1,2)[12] Total fit time: 120.865 seconds
# Forecast 24 steps ahead with a 95% confidence interval.
n_periods = 24
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
# The model was fit on a DAILY series, so the forecast index must advance
# one day per step, starting the day AFTER the last observation.
# (The original used freq='MS' — month-start — which scattered the 24 daily
# steps over two years and overlapped step 1 with the last observed date.)
index_of_fc = pd.date_range(df_d_daily_copy.index[-1] + pd.Timedelta(days=1),
                            periods=n_periods, freq='D')
# make series for plotting purpose
fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
# Plot observed data, point forecast, and shaded confidence band.
plt.plot(df_d_daily_copy)
plt.plot(fitted_series, color='darkgreen')
plt.fill_between(lower_series.index,
                 lower_series,
                 upper_series,
                 color='k', alpha=.15)
plt.title("SARIMA - Forecast of temperature change")
plt.show()
# Work on an independent copy: plain assignment would alias df_d_daily, so
# the in-place prep below would mutate the original frame. The earlier
# SARIMA cell for this same series already used .copy() — stay consistent.
df_d_daily_copy = df_d_daily.copy()
ChangeDatasetForNeural(df_d_daily_copy)
# NeuralProphet expects the target column to be named "y".
df_d_daily_copy = df_d_daily_copy.rename(columns={"Temp": "y"})
df_d_daily_copy = df_d_daily_copy.dropna()
df_d_daily_copy.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1091 entries, 2018-11-18 to 2021-11-18 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 y 1091 non-null float64 1 ds 1091 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1) memory usage: 25.6 KB
# Silence library warnings during training.
warnings.filterwarnings('ignore')
# NOTE(review): the NeuralProphet import is commented out at the top of the
# file — re-enable it before running this cell.
model = NeuralProphet(yearly_seasonality=True,
epochs=30
)
# freq="D" matches the daily cadence of df_d_daily_copy.
result = model.fit(df_d_daily_copy, freq="D")
result
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.817% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.utils.set_auto_seasonalities) - Disabling daily seasonality. Run NeuralProphet with daily_seasonality=True to override this. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.12E-01, min: 1.34E+00
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.12E-01, min: 1.14E+00 INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 1.08E-01 Epoch[30/30]: 100%|██████████████| 30/30 [00:02<00:00, 14.23it/s, SmoothL1Loss=0.00475, MAE=1.14, RMSE=1.48, RegLoss=0]
| SmoothL1Loss | MAE | RMSE | RegLoss | |
|---|---|---|---|---|
| 0 | 0.627900 | 15.859956 | 19.434648 | 0.0 |
| 1 | 0.420125 | 12.226717 | 14.982056 | 0.0 |
| 2 | 0.105582 | 5.540872 | 6.777204 | 0.0 |
| 3 | 0.024386 | 2.697784 | 3.306753 | 0.0 |
| 4 | 0.007332 | 1.453510 | 1.839402 | 0.0 |
| 5 | 0.006343 | 1.348739 | 1.710686 | 0.0 |
| 6 | 0.006465 | 1.368926 | 1.725668 | 0.0 |
| 7 | 0.007536 | 1.487490 | 1.851824 | 0.0 |
| 8 | 0.006569 | 1.373333 | 1.739725 | 0.0 |
| 9 | 0.007218 | 1.443493 | 1.821650 | 0.0 |
| 10 | 0.008125 | 1.533569 | 1.925610 | 0.0 |
| 11 | 0.008091 | 1.533028 | 1.929926 | 0.0 |
| 12 | 0.008548 | 1.548451 | 1.973753 | 0.0 |
| 13 | 0.006817 | 1.380341 | 1.763322 | 0.0 |
| 14 | 0.006838 | 1.409318 | 1.774377 | 0.0 |
| 15 | 0.008264 | 1.519089 | 1.929256 | 0.0 |
| 16 | 0.007711 | 1.501538 | 1.885602 | 0.0 |
| 17 | 0.006974 | 1.414332 | 1.794249 | 0.0 |
| 18 | 0.006101 | 1.322489 | 1.679469 | 0.0 |
| 19 | 0.005889 | 1.292348 | 1.649151 | 0.0 |
| 20 | 0.006425 | 1.360726 | 1.722425 | 0.0 |
| 21 | 0.005606 | 1.245051 | 1.596938 | 0.0 |
| 22 | 0.005928 | 1.306999 | 1.657137 | 0.0 |
| 23 | 0.005675 | 1.263278 | 1.620765 | 0.0 |
| 24 | 0.005277 | 1.227014 | 1.561906 | 0.0 |
| 25 | 0.005170 | 1.183020 | 1.546774 | 0.0 |
| 26 | 0.004977 | 1.180443 | 1.514147 | 0.0 |
| 27 | 0.004825 | 1.143463 | 1.486214 | 0.0 |
| 28 | 0.004773 | 1.142215 | 1.481845 | 0.0 |
| 29 | 0.004748 | 1.139029 | 1.475434 | 0.0 |
# Extend the daily frame 365 periods ahead (history rows included).
future = model.make_future_dataframe(df_d_daily_copy, periods=365, n_historic_predictions=True)
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.817% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
# Predict over history + future and draw NeuralProphet's built-in plot.
forecast = model.predict(future)
fig1 = model.plot(forecast)
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.863% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.863% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
# Compare observed ("y") and predicted ("yhat1") values over time.
fig, ax = plt.subplots()
dates = forecast["ds"]
ax.plot(dates, forecast["y"], label="Data")
ax.plot(dates, forecast["yhat1"], label="Predicted Data")
ax.legend(loc='upper right')
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()
# Take a NaN-free copy of the raw 5-minute df_d readings for modelling,
# leaving the original frame untouched.
df_d_copy = df_d.copy().dropna()
df_d_copy
| Temp | |
|---|---|
| EventDt | |
| 2018-11-18 16:15:00 | 15.16 |
| 2018-11-18 16:20:00 | 15.03 |
| 2018-11-18 16:25:00 | 14.88 |
| 2018-11-18 16:30:00 | 14.82 |
| 2018-11-18 16:35:00 | 15.96 |
| ... | ... |
| 2021-11-18 15:50:00 | 21.53 |
| 2021-11-18 15:55:00 | 21.47 |
| 2021-11-18 16:00:00 | 21.62 |
| 2021-11-18 16:05:00 | 21.58 |
| 2021-11-18 16:10:00 | 21.66 |
313427 rows × 1 columns
# Prepare the frame for NeuralProphet: ChangeDatasetForNeural adds the 'ds'
# timestamp column (visible in the frame display below), and the target
# column is renamed to the expected "y".
ChangeDatasetForNeural(df_d_copy)
df_d_copy = df_d_copy.rename(columns={"Temp": "y"})
df_d_copy
| y | ds | |
|---|---|---|
| EventDt | ||
| 2018-11-18 16:15:00 | 15.16 | 2018-11-18 16:15:00 |
| 2018-11-18 16:20:00 | 15.03 | 2018-11-18 16:20:00 |
| 2018-11-18 16:25:00 | 14.88 | 2018-11-18 16:25:00 |
| 2018-11-18 16:30:00 | 14.82 | 2018-11-18 16:30:00 |
| 2018-11-18 16:35:00 | 15.96 | 2018-11-18 16:35:00 |
| ... | ... | ... |
| 2021-11-18 15:50:00 | 21.53 | 2021-11-18 15:50:00 |
| 2021-11-18 15:55:00 | 21.47 | 2021-11-18 15:55:00 |
| 2021-11-18 16:00:00 | 21.62 | 2021-11-18 16:00:00 |
| 2021-11-18 16:05:00 | 21.58 | 2021-11-18 16:05:00 |
| 2021-11-18 16:10:00 | 21.66 | 2021-11-18 16:10:00 |
313427 rows × 2 columns
# Silence library warnings during training.
warnings.filterwarnings('ignore')
model = NeuralProphet(yearly_seasonality=True,
                      epochs=30
                      )
# df_d_copy holds raw 5-minute readings (see the timestamps above); fitting
# with freq="D" triggered NeuralProphet's "Defined frequency D is different
# than major frequency 5T" warning, so declare the true cadence instead.
result = model.fit(df_d_copy, freq="5min")
result
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.968% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 128
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.05E-01, min: 1.04E+00
INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.05E-01, min: 1.16E+00 INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 2.10E-01 Epoch[30/30]: 100%|██████████████| 30/30 [03:02<00:00, 6.10s/it, SmoothL1Loss=0.00794, MAE=1.75, RMSE=2.34, RegLoss=0]
| SmoothL1Loss | MAE | RMSE | RegLoss | |
|---|---|---|---|---|
| 0 | 0.068052 | 3.949855 | 5.068529 | 0.0 |
| 1 | 0.008855 | 1.864587 | 2.468367 | 0.0 |
| 2 | 0.010319 | 2.045449 | 2.663936 | 0.0 |
| 3 | 0.012117 | 2.238181 | 2.886144 | 0.0 |
| 4 | 0.014356 | 2.456709 | 3.138546 | 0.0 |
| 5 | 0.017062 | 2.690398 | 3.417684 | 0.0 |
| 6 | 0.019858 | 2.913022 | 3.683522 | 0.0 |
| 7 | 0.021716 | 3.050347 | 3.846931 | 0.0 |
| 8 | 0.022652 | 3.117826 | 3.928163 | 0.0 |
| 9 | 0.022851 | 3.130324 | 3.945856 | 0.0 |
| 10 | 0.022791 | 3.125246 | 3.940424 | 0.0 |
| 11 | 0.022095 | 3.072392 | 3.878075 | 0.0 |
| 12 | 0.021085 | 3.001503 | 3.792021 | 0.0 |
| 13 | 0.020499 | 2.960004 | 3.740847 | 0.0 |
| 14 | 0.018984 | 2.844745 | 3.598712 | 0.0 |
| 15 | 0.018522 | 2.803656 | 3.557704 | 0.0 |
| 16 | 0.017143 | 2.698112 | 3.427483 | 0.0 |
| 17 | 0.015928 | 2.594456 | 3.303041 | 0.0 |
| 18 | 0.014667 | 2.484696 | 3.172416 | 0.0 |
| 19 | 0.013713 | 2.395804 | 3.067928 | 0.0 |
| 20 | 0.013035 | 2.333928 | 2.993250 | 0.0 |
| 21 | 0.011964 | 2.220181 | 2.867847 | 0.0 |
| 22 | 0.011207 | 2.142770 | 2.776952 | 0.0 |
| 23 | 0.010397 | 2.054251 | 2.674830 | 0.0 |
| 24 | 0.009649 | 1.968042 | 2.577787 | 0.0 |
| 25 | 0.009202 | 1.912802 | 2.517100 | 0.0 |
| 26 | 0.008699 | 1.848846 | 2.446576 | 0.0 |
| 27 | 0.008358 | 1.803105 | 2.398586 | 0.0 |
| 28 | 0.008084 | 1.765734 | 2.358481 | 0.0 |
| 29 | 0.007940 | 1.745864 | 2.337590 | 0.0 |
# Extend the 5-minute frame 365 periods ahead (history rows included).
future = model.make_future_dataframe(df_d_copy, periods=365, n_historic_predictions=True)
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.968% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T
# Predict over history + future and draw NeuralProphet's built-in plot.
forecast = model.predict(future)
fig1 = model.plot(forecast)
INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.852% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T INFO - (NP.df_utils._infer_frequency) - Major frequency 5T corresponds to 99.852% of the data. WARNING - (NP.df_utils._infer_frequency) - Defined frequency D is different than major frequency 5T
# Final comparison plot: observed data against the model's prediction.
fig, ax = plt.subplots()
timeline = forecast["ds"]
ax.plot(timeline, forecast["y"], label="Data")
ax.plot(timeline, forecast["yhat1"], label="Predicted Data")
ax.legend(loc='upper right')
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()